%%% -*-BibTeX-*- %%% ==================================================================== %%% BibTeX-file{ %%% author = "Nelson H. F. Beebe", %%% version = "1.21", %%% date = "28 July 2015", %%% time = "17:20:30 MDT", %%% filename = "tkdd.bib", %%% address = "University of Utah %%% Department of Mathematics, 110 LCB %%% 155 S 1400 E RM 233 %%% Salt Lake City, UT 84112-0090 %%% USA", %%% telephone = "+1 801 581 5254", %%% FAX = "+1 801 581 4148", %%% URL = "http://www.math.utah.edu/~beebe", %%% checksum = "42273 9977 55639 530716", %%% email = "beebe at math.utah.edu, beebe at acm.org, %%% beebe at computer.org (Internet)", %%% codetable = "ISO/ASCII", %%% keywords = "ACM Transactions on Knowledge Discovery from %%% Data (TKDD); bibliography; TKDD", %%% license = "public domain", %%% supported = "yes", %%% docstring = "This is a COMPLETE BibTeX bibliography for %%% ACM Transactions on Knowledge Discovery from %%% Data (TKDD) (CODEN ????, ISSN 1556-4681), %%% covering all journal issues from 2007 -- %%% date. %%% %%% At version 1.21, the COMPLETE journal %%% coverage looked like this: %%% %%% 2007 ( 14) 2010 ( 26) 2013 ( 20) %%% 2008 ( 18) 2011 ( 11) 2014 ( 37) %%% 2009 ( 25) 2012 ( 26) 2015 ( 31) %%% %%% Article: 208 %%% %%% Total entries: 208 %%% %%% The journal Web page can be found at: %%% %%% http://www.acm.org/pubs/tkdd.html %%% %%% The journal table of contents page is at: %%% %%% http://www.acm.org/tkdd/ %%% http://portal.acm.org/browse_dl.cfm?idx=J1054 %%% %%% Qualified subscribers can retrieve the full %%% text of recent articles in PDF form. %%% %%% The initial draft was extracted from the ACM %%% Web pages. %%% %%% ACM copyrights explicitly permit abstracting %%% with credit, so article abstracts, keywords, %%% and subject classifications have been %%% included in this bibliography wherever %%% available. Article reviews have been %%% omitted, until their copyright status has %%% been clarified. 
%%% %%% bibsource keys in the bibliography entries %%% below indicate the entry originally came %%% from the computer science bibliography %%% archive, even though it has likely since %%% been corrected and updated. %%% %%% URL keys in the bibliography point to %%% World Wide Web locations of additional %%% information about the entry. %%% %%% BibTeX citation tags are uniformly chosen %%% as name:year:abbrev, where name is the %%% family name of the first author or editor, %%% year is a 4-digit number, and abbrev is a %%% 3-letter condensation of important title %%% words. Citation tags were automatically %%% generated by software developed for the %%% BibNet Project. %%% %%% In this bibliography, entries are sorted in %%% publication order, using ``bibsort -byvolume.'' %%% %%% The checksum field above contains a CRC-16 %%% checksum as the first value, followed by the %%% equivalent of the standard UNIX wc (word %%% count) utility output of lines, words, and %%% characters. This is produced by Robert %%% Solovay's checksum utility." %%% } %%% ==================================================================== @Preamble{"\input bibnames.sty" # "\def \TM {${}^{\sc TM}$}" } %%% ==================================================================== %%% Acknowledgement abbreviations: @String{ack-nhfb= "Nelson H. F. 
Beebe, University of Utah, Department of Mathematics, 110 LCB, 155 S 1400 E RM 233, Salt Lake City, UT 84112-0090, USA, Tel: +1 801 581 5254, FAX: +1 801 581 4148, e-mail: \path|beebe@math.utah.edu|, \path|beebe@acm.org|, \path|beebe@computer.org| (Internet), URL: \path|http://www.math.utah.edu/~beebe/|"} %%% ==================================================================== %%% Journal abbreviations: @String{j-TKDD= "ACM Transactions on Knowledge Discovery from Data (TKDD)"} %%% ==================================================================== %%% Bibliography entries: @Article{Han:2007:I, author = "Jiawei Han", title = "Introduction", journal = j-TKDD, volume = "1", number = "1", pages = "1:1--1:??", month = mar, year = "2007", CODEN = "????", DOI = "http://doi.acm.org/10.1145/1217299.1217300", ISSN = "1556-4681 (print), 1556-472X (electronic)", bibdate = "Fri Apr 24 17:58:36 MDT 2009", bibsource = "http://www.acm.org/pubs/contents/journals/tkdd/; http://www.math.utah.edu/pub/tex/bib/tkdd.bib", acknowledgement = ack-nhfb, articleno = "1", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J1054", } @Article{Leskovec:2007:GED, author = "Jure Leskovec and Jon Kleinberg and Christos Faloutsos", title = "Graph evolution: {Densification} and shrinking diameters", journal = j-TKDD, volume = "1", number = "1", pages = "2:1--2:??", month = mar, year = "2007", CODEN = "????", DOI = "http://doi.acm.org/10.1145/1217299.1217301", ISSN = "1556-4681 (print), 1556-472X (electronic)", bibdate = "Fri Apr 24 17:58:36 MDT 2009", bibsource = "http://www.acm.org/pubs/contents/journals/tkdd/; http://www.math.utah.edu/pub/tex/bib/tkdd.bib", abstract = "How do real graphs evolve over time? What are normal growth patterns in social, technological, and information networks? 
Many studies have discovered patterns in {\em static graphs}, identifying properties in a single snapshot of a large network or in a very small number of snapshots; these include heavy tails for in- and out-degree distributions, communities, small-world phenomena, and others. However, given the lack of information about network evolution over long periods, it has been hard to convert these findings into statements about trends over time.\par Here we study a wide range of real graphs, and we observe some surprising phenomena. First, most of these graphs densify over time with the number of edges growing superlinearly in the number of nodes. Second, the average distance between nodes often shrinks over time in contrast to the conventional wisdom that such distance parameters should increase slowly as a function of the number of nodes (like $O(\log n)$ or $O(\log(\log n))$).\par Existing graph generation models do not exhibit these types of behavior even at a qualitative level. We provide a new graph generator, based on a forest fire spreading process that has a simple, intuitive justification, requires very few parameters (like the flammability of nodes), and produces graphs exhibiting the full range of properties observed both in prior work and in the present study.\par We also notice that the forest fire model exhibits a sharp transition between sparse graphs and graphs that are densifying. Graphs with decreasing distance between the nodes are generated around this transition point.\par Last, we analyze the connection between the temporal evolution of the degree distribution and densification of a graph. We find that the two are fundamentally related. 
We also observe that real networks exhibit this type of relation between densification and the degree distribution.", acknowledgement = ack-nhfb, articleno = "2", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J1054", keywords = "Densification power laws; graph generators; graph mining; heavy-tailed distributions; small-world phenomena", }

%%% In the following entry, the keywords field had lost its math markup
%%% (a bare "-diversity" and "ell" fused onto "k-anonymity"); it is
%%% restored using the notation of the entry's own abstract.

@Article{Machanavajjhala:2007:DPB,
  author =       "Ashwin Machanavajjhala and Daniel Kifer and Johannes
                 Gehrke and Muthuramakrishnan Venkitasubramaniam",
  title =        "{{$L$}}-diversity: {Privacy} beyond $k$-anonymity",
  journal =      j-TKDD,
  volume =       "1",
  number =       "1",
  pages =        "3:1--3:??",
  month =        mar,
  year =         "2007",
  CODEN =        "????",
  DOI =          "http://doi.acm.org/10.1145/1217299.1217302",
  ISSN =         "1556-4681 (print), 1556-472X (electronic)",
  bibdate =      "Fri Apr 24 17:58:36 MDT 2009",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tkdd/;
                 http://www.math.utah.edu/pub/tex/bib/tkdd.bib",
  abstract =     "Publishing data about individuals without revealing
                 sensitive information about them is an important
                 problem. In recent years, a new definition of privacy
                 called $k$-anonymity has gained popularity. In a
                 $k$-anonymized dataset, each record is
                 indistinguishable from at least $k - 1$ other records
                 with respect to certain identifying attributes.\par

                 In this article, we show using two simple attacks that
                 a $k$-anonymized dataset has some subtle but severe
                 privacy problems. First, an attacker can discover the
                 values of sensitive attributes when there is little
                 diversity in those sensitive attributes. This is a
                 known problem. Second, attackers often have background
                 knowledge, and we show that $k$-anonymity does not
                 guarantee privacy against attackers using background
                 knowledge. We give a detailed analysis of these two
                 attacks, and we propose a novel and powerful privacy
                 criterion called $\ell$-diversity that can defend
                 against such attacks. In addition to building a formal
                 foundation for $\ell$-diversity, we show in an
                 experimental evaluation that $\ell$-diversity is
                 practical and can be implemented efficiently.",
  acknowledgement = ack-nhfb,
  articleno =    "3",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1054",
  keywords =     "$\ell$-diversity; Data privacy; $k$-anonymity;
                 privacy-preserving data publishing",
}

@Article{Gionis:2007:CA, author = "Aristides Gionis and Heikki Mannila and Panayiotis Tsaparas", title = "Clustering aggregation", journal = j-TKDD, volume = "1", number = "1", pages = "4:1--4:??", month = mar, year = "2007", CODEN = "????", DOI = "http://doi.acm.org/10.1145/1217299.1217303", ISSN = "1556-4681 (print), 1556-472X (electronic)", bibdate = "Fri Apr 24 17:58:36 MDT 2009", bibsource = "http://www.acm.org/pubs/contents/journals/tkdd/; http://www.math.utah.edu/pub/tex/bib/tkdd.bib", abstract = "We consider the following problem: given a set of clusterings, find a single clustering that agrees as much as possible with the input clusterings. This problem, {\em clustering aggregation}, appears naturally in various contexts. For example, clustering categorical data is an instance of the clustering aggregation problem; each categorical attribute can be viewed as a clustering of the input rows where rows are grouped together if they take the same value on that attribute. Clustering aggregation can also be used as a metaclustering method to improve the robustness of clustering by combining the output of multiple algorithms. Furthermore, the problem formulation does not require a priori information about the number of clusters; it is naturally determined by the optimization function.\par In this article, we give a formal statement of the clustering aggregation problem, and we propose a number of algorithms. Our algorithms make use of the connection between clustering aggregation and the problem of {\em correlation clustering}. 
Although the problems we consider are NP-hard, for several of our methods, we provide theoretical guarantees on the quality of the solutions. Our work provides the best deterministic approximation algorithm for the variation of the correlation clustering problem we consider. We also show how sampling can be used to scale the algorithms for large datasets. We give an extensive empirical evaluation demonstrating the usefulness of the problem and of the solutions.", acknowledgement = ack-nhfb, articleno = "4", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J1054", keywords = "clustering aggregation; clustering categorical data; correlation clustering; Data clustering", } @Article{Bhattacharya:2007:CER, author = "Indrajit Bhattacharya and Lise Getoor", title = "Collective entity resolution in relational data", journal = j-TKDD, volume = "1", number = "1", pages = "5:1--5:??", month = mar, year = "2007", CODEN = "????", DOI = "http://doi.acm.org/10.1145/1217299.1217304", ISSN = "1556-4681 (print), 1556-472X (electronic)", bibdate = "Fri Apr 24 17:58:36 MDT 2009", bibsource = "http://www.acm.org/pubs/contents/journals/tkdd/; http://www.math.utah.edu/pub/tex/bib/tkdd.bib", abstract = "Many databases contain uncertain and imprecise references to real-world entities. The absence of identifiers for the underlying entities often results in a database which contains multiple references to the same entity. This can lead not only to data redundancy, but also inaccuracies in query processing and knowledge extraction. These problems can be alleviated through the use of {\em entity resolution}. Entity resolution involves discovering the underlying entities and mapping each database reference to these entities. Traditionally, entities are resolved using pairwise similarity over the attributes of references. However, there is often additional relational information in the data. Specifically, references to different entities may cooccur. 
In these cases, collective entity resolution, in which entities for cooccurring references are determined jointly rather than independently, can improve entity resolution accuracy. We propose a novel relational clustering algorithm that uses both attribute and relational information for determining the underlying domain entities, and we give an efficient implementation. We investigate the impact that different relational similarity measures have on entity resolution quality. We evaluate our collective entity resolution algorithm on multiple real-world databases. We show that it improves entity resolution performance over both attribute-based baselines and over algorithms that consider relational information but do not resolve entities collectively. In addition, we perform detailed experiments on synthetically generated data to identify data characteristics that favor collective relational resolution over purely attribute-based algorithms.", acknowledgement = ack-nhfb, articleno = "5", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J1054", keywords = "data cleaning; Entity resolution; graph clustering; record linkage", } @Article{Loh:2007:EEL, author = "Wei-Yin Loh and Chien-Wei Chen and Wei Zheng", title = "Extrapolation errors in linear model trees", journal = j-TKDD, volume = "1", number = "2", pages = "6:1--6:??", month = aug, year = "2007", CODEN = "????", DOI = "http://doi.acm.org/10.1145/1267066.1267067", ISSN = "1556-4681 (print), 1556-472X (electronic)", bibdate = "Fri Apr 24 17:58:48 MDT 2009", bibsource = "http://www.acm.org/pubs/contents/journals/tkdd/; http://www.math.utah.edu/pub/tex/bib/tkdd.bib", abstract = "Prediction errors from a linear model tend to be larger when extrapolation is involved, particularly when the model is wrong. This article considers the problem of extrapolation and interpolation errors when a linear model tree is used for prediction. 
It proposes several ways to curtail the size of the errors, and uses a large collection of real datasets to demonstrate that the solutions are effective in reducing the average mean squared prediction error. The article also provides a proof that, if a linear model is correct, the proposed solutions have no undesirable effects as the training sample size tends to infinity.", acknowledgement = ack-nhfb, articleno = "6", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J1054", keywords = "Decision tree; prediction; regression; statistics", } @Article{Zhang:2007:MPP, author = "Minghua Zhang and Ben Kao and David W. Cheung and Kevin Y. Yip", title = "Mining periodic patterns with gap requirement from sequences", journal = j-TKDD, volume = "1", number = "2", pages = "7:1--7:??", month = aug, year = "2007", CODEN = "????", DOI = "http://doi.acm.org/10.1145/1267066.1267068", ISSN = "1556-4681 (print), 1556-472X (electronic)", bibdate = "Fri Apr 24 17:58:48 MDT 2009", bibsource = "http://www.acm.org/pubs/contents/journals/tkdd/; http://www.math.utah.edu/pub/tex/bib/tkdd.bib", abstract = "We study a problem of mining frequently occurring periodic patterns with a gap requirement from sequences. Given a character sequence $S$ of length $L$ and a pattern $P$ of length $l$, we consider $P$ a frequently occurring pattern in $S$ if the probability of {\em observing\/} $P$ given a randomly picked length-$l$ subsequence of $S$ exceeds a certain threshold. In many applications, particularly those related to bioinformatics, interesting patterns are {\em periodic\/} with a {\em gap requirement}. That is to say, the characters in $P$ should match subsequences of $S$ in such a way that the matching characters in $S$ are separated by gaps of more or less the same size. We show the complexity of the mining problem and discuss why traditional mining algorithms are computationally infeasible. We propose practical algorithms for solving the problem and study their characteristics. 
We also present a case study in which we apply our algorithms on some DNA sequences. We discuss some interesting patterns obtained from the case study.", acknowledgement = ack-nhfb, articleno = "7", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J1054", keywords = "gap requirement; periodic pattern; Sequence mining", }

%%% In the following entry's abstract, the inequality is set entirely in
%%% math mode ($k > 2$): a text-mode greater-than sign prints as an
%%% inverted question mark under LaTeX's default OT1 font encoding.

@Article{Huang:2007:TTE,
  author =       "Jen-Wei Huang and Bi-Ru Dai and Ming-Syan Chen",
  title =        "{Twain}: {Two-end} association miner with precise
                 frequent exhibition periods",
  journal =      j-TKDD,
  volume =       "1",
  number =       "2",
  pages =        "8:1--8:??",
  month =        aug,
  year =         "2007",
  CODEN =        "????",
  DOI =          "http://doi.acm.org/10.1145/1267066.1267069",
  ISSN =         "1556-4681 (print), 1556-472X (electronic)",
  bibdate =      "Fri Apr 24 17:58:48 MDT 2009",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tkdd/;
                 http://www.math.utah.edu/pub/tex/bib/tkdd.bib",
  abstract =     "We investigate the general model of mining
                 associations in a temporal database, where the
                 exhibition periods of items are allowed to be
                 different from one to another. The database is divided
                 into partitions according to the time granularity
                 imposed. Such temporal association rules allow us to
                 observe short-term but interesting patterns that are
                 absent when the whole range of the database is
                 evaluated altogether. Prior work may omit some
                 temporal association rules and thus have limited
                 practicability. To remedy this and to give more
                 precise frequent exhibition periods of frequent
                 temporal itemsets, we devise an efficient algorithm
                 {\em Twain\/} (standing for {\em TWo end AssocIation
                 miNer\/}). {\em Twain\/} not only generates frequent
                 patterns with more precise frequent exhibition
                 periods, but also discovers more interesting frequent
                 patterns. {\em Twain\/} employs Start time and End
                 time of each item to provide precise frequent
                 exhibition period while progressively handling
                 itemsets from one partition to another. Along with one
                 scan of the database, {\em Twain\/} can generate
                 frequent 2-itemsets directly according to the
                 cumulative filtering threshold. Then, {\em Twain\/}
                 adopts the scan reduction technique to generate all
                 frequent $k$-itemsets ($k > 2$) from the generated
                 frequent 2-itemsets. Theoretical properties of {\em
                 Twain\/} are derived as well in this article. The
                 experimental results show that {\em Twain\/}
                 outperforms the prior works in the quality of frequent
                 patterns, execution time, I/O cost, CPU overhead and
                 scalability.",
  acknowledgement = ack-nhfb,
  articleno =    "8",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1054",
  keywords =     "Association; temporal",
}

%%% The citation tag of the following entry keeps its historical
%%% spelling so that existing \cite commands continue to resolve; only
%%% the author field is corrected (first author Bayardo, fifth author
%%% Gehrke).  TODO: confirm both names against the ACM Digital Library
%%% record for the DOI given in the entry.

@Article{Bayardop:2007:ISI,
  author =       "Roberto Bayardo and Kristin P. Bennett and Gautam Das
                 and Dimitrios Gunopulos and Johannes Gehrke",
  title =        "Introduction to special issue {ACM SIGKDD 2006}",
  journal =      j-TKDD,
  volume =       "1",
  number =       "3",
  pages =        "9:1--9:??",
  month =        dec,
  year =         "2007",
  CODEN =        "????",
  DOI =          "http://doi.acm.org/10.1145/1297332.1297333",
  ISSN =         "1556-4681 (print), 1556-472X (electronic)",
  bibdate =      "Fri Apr 24 17:58:56 MDT 2009",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tkdd/;
                 http://www.math.utah.edu/pub/tex/bib/tkdd.bib",
  acknowledgement = ack-nhfb,
  articleno =    "9",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1054",
}

@Article{Bohm:2007:RPF, author = "Christian B{\"o}hm and Christos Faloutsos and Jia-Yu Pan and Claudia Plant", title = "{RIC}: {Parameter-free} noise-robust clustering", journal = j-TKDD, volume = "1", number = "3", pages = "10:1--10:??", month = dec, year = "2007", CODEN = "????", DOI = "http://doi.acm.org/10.1145/1297332.1297334", ISSN = "1556-4681 (print), 1556-472X (electronic)", bibdate = "Fri Apr 24 17:58:56 MDT 2009", bibsource = "http://www.acm.org/pubs/contents/journals/tkdd/; http://www.math.utah.edu/pub/tex/bib/tkdd.bib", abstract = "How do we find a {\em natural\/} clustering of a real-world point set which contains an 
unknown number of clusters with different shapes, and which may be contaminated by noise? As most clustering algorithms were designed with certain assumptions (Gaussianity), they often require the user to give input parameters, and are sensitive to noise. In this article, we propose a robust framework for determining a natural clustering of a given dataset, based on the minimum description length (MDL) principle. The proposed framework, {\em robust information-theoretic clustering (RIC)}, is orthogonal to any known clustering algorithm: Given a preliminary clustering, RIC purifies these clusters from noise, and adjusts the clusterings such that it simultaneously determines the most natural amount and shape (subspace) of the clusters. Our RIC method can be combined with any clustering technique ranging from K-means and K-medoids to advanced methods such as spectral clustering. In fact, RIC is even able to purify and improve an initial coarse clustering, even if we start with very simple methods. In an extension, we propose a fully automatic stand-alone clustering method and efficiency improvements. RIC scales well with the dataset size. 
Extensive experiments on synthetic and real-world datasets validate the proposed RIC framework.", acknowledgement = ack-nhfb, articleno = "10", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J1054", keywords = "Clustering; data summarization; noise robustness; parameter-free data mining", } @Article{Mei:2007:SAF, author = "Qiaozhu Mei and Dong Xin and Hong Cheng and Jiawei Han and Chengxiang Zhai", title = "Semantic annotation of frequent patterns", journal = j-TKDD, volume = "1", number = "3", pages = "11:1--11:??", month = dec, year = "2007", CODEN = "????", DOI = "http://doi.acm.org/10.1145/1297332.1297335", ISSN = "1556-4681 (print), 1556-472X (electronic)", bibdate = "Fri Apr 24 17:58:56 MDT 2009", bibsource = "http://www.acm.org/pubs/contents/journals/tkdd/; http://www.math.utah.edu/pub/tex/bib/tkdd.bib", abstract = "Using frequent patterns to analyze data has been one of the fundamental approaches in many data mining applications. Research in frequent pattern mining has so far mostly focused on developing efficient algorithms to discover various kinds of frequent patterns, but little attention has been paid to the important next step --- interpreting the discovered frequent patterns. Although the compression and summarization of frequent patterns has been studied in some recent work, the proposed techniques there can only annotate a frequent pattern with nonsemantical information (e.g., support), which provides only limited help for a user to understand the patterns.\par In this article, we study the novel problem of generating semantic annotations for frequent patterns. The goal is to discover the hidden meanings of a frequent pattern by annotating it with in-depth, concise, and structured information. We propose a general approach to generate such an annotation for a frequent pattern by constructing its context model, selecting informative context indicators, and extracting representative transactions and semantically similar patterns. 
This general approach can well incorporate the user's prior knowledge, and has potentially many applications, such as generating a dictionary-like description for a pattern, finding synonym patterns, discovering semantic relations, and summarizing semantic classes of a set of frequent patterns. Experiments on different datasets show that our approach is effective in generating semantic pattern annotations.", acknowledgement = ack-nhfb, articleno = "11", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J1054", keywords = "Frequent pattern; pattern annotation; pattern context; pattern semantic analysis", } @Article{Koren:2007:MEP, author = "Yehuda Koren and Stephen C. North and Chris Volinsky", title = "Measuring and extracting proximity graphs in networks", journal = j-TKDD, volume = "1", number = "3", pages = "12:1--12:??", month = dec, year = "2007", CODEN = "????", DOI = "http://doi.acm.org/10.1145/1297332.1297336", ISSN = "1556-4681 (print), 1556-472X (electronic)", bibdate = "Fri Apr 24 17:58:56 MDT 2009", bibsource = "http://www.acm.org/pubs/contents/journals/tkdd/; http://www.math.utah.edu/pub/tex/bib/tkdd.bib", abstract = "Measuring distance or some other form of proximity between objects is a standard data mining tool. Connection subgraphs were recently proposed as a way to demonstrate proximity between nodes in networks. We propose a new way of measuring and extracting proximity in networks called ``cycle-free effective conductance'' (CFEC). Importantly, the measured proximity is accompanied with a {\em proximity subgraph\/} which allows assessing and understanding measured values. Our proximity calculation can handle more than two endpoints, directed edges, is statistically well behaved, and produces an effectiveness score for the computed subgraphs. We provide an efficient algorithm to measure and extract proximity. 
Also, we report experimental results and show examples for four large network datasets: a telecommunications calling graph, the IMDB actors graph, an academic coauthorship network, and a movie recommendation system.", acknowledgement = ack-nhfb, articleno = "12", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J1054", keywords = "Connection subgraph; cycle-free escape probability; escape probability; graph mining; proximity; proximity subgraph; random walk", } @Article{Ihler:2007:LDE, author = "Alexander Ihler and Jon Hutchins and Padhraic Smyth", title = "Learning to detect events with {Markov}-modulated {Poisson} processes", journal = j-TKDD, volume = "1", number = "3", pages = "13:1--13:??", month = dec, year = "2007", CODEN = "????", DOI = "http://doi.acm.org/10.1145/1297332.1297337", ISSN = "1556-4681 (print), 1556-472X (electronic)", bibdate = "Fri Apr 24 17:58:56 MDT 2009", bibsource = "http://www.acm.org/pubs/contents/journals/tkdd/; http://www.math.utah.edu/pub/tex/bib/tkdd.bib", abstract = "Time-series of count data occur in many different contexts, including Internet navigation logs, freeway traffic monitoring, and security logs associated with buildings. In this article we describe a framework for detecting anomalous events in such data using an unsupervised learning approach. Normal periodic behavior is modeled via a time-varying Poisson process model, which in turn is modulated by a hidden Markov process that accounts for bursty events. We outline a Bayesian framework for learning the parameters of this model from count time-series. Two large real-world datasets of time-series counts are used as testbeds to validate the approach, consisting of freeway traffic data and logs of people entering and exiting a building. We show that the proposed model is significantly more accurate at detecting known events than a more traditional threshold-based technique. 
We also describe how the model can be used to investigate different degrees of periodicity in the data, including systematic day-of-week and time-of-day effects, and to make inferences about different aspects of events such as number of vehicles or people involved. The results indicate that the Markov-modulated Poisson framework provides a robust and accurate framework for adaptively and autonomously learning how to separate unusual bursty events from traces of normal human activity.", acknowledgement = ack-nhfb, articleno = "13", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J1054", keywords = "Event detection; Markov modulated; Poisson", } @Article{Gionis:2007:ADM, author = "Aristides Gionis and Heikki Mannila and Taneli Mielik{\"a}inen and Panayiotis Tsaparas", title = "Assessing data mining results via swap randomization", journal = j-TKDD, volume = "1", number = "3", pages = "14:1--14:??", month = dec, year = "2007", CODEN = "????", DOI = "http://doi.acm.org/10.1145/1297332.1297338", ISSN = "1556-4681 (print), 1556-472X (electronic)", bibdate = "Fri Apr 24 17:58:56 MDT 2009", bibsource = "http://www.acm.org/pubs/contents/journals/tkdd/; http://www.math.utah.edu/pub/tex/bib/tkdd.bib", abstract = "The problem of assessing the significance of data mining results on high-dimensional 0--1 datasets has been studied extensively in the literature. For problems such as mining frequent sets and finding correlations, significance testing can be done by standard statistical tests such as chi-square, or other methods. However, the results of such tests depend only on the specific attributes and not on the dataset as a whole. Moreover, the tests are difficult to apply to sets of patterns or other complex results of data mining algorithms. In this article, we consider a simple randomization technique that deals with this shortcoming. 
The approach consists of producing random datasets that have the same row and column margins as the given dataset, computing the results of interest on the randomized instances and comparing them to the results on the actual data. This randomization technique can be used to assess the results of many different types of data mining algorithms, such as frequent sets, clustering, and spectral analysis. To generate random datasets with given margins, we use variations of a Markov chain approach which is based on a simple swap operation. We give theoretical results on the efficiency of different randomization methods, and apply the swap randomization method to several well-known datasets. Our results indicate that for some datasets the structure discovered by the data mining algorithms is expected, given the row and column margins of the datasets, while for other datasets the discovered structure conveys information that is not captured by the margin counts.", acknowledgement = ack-nhfb, articleno = "14", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J1054", keywords = "0--1 data; randomization tests; Significance testing; swaps", } @Article{Tang:2008:TTA, author = "Lei Tang and Huan Liu and Jianping Zhang and Nitin Agarwal and John J. Salerno", title = "Topic taxonomy adaptation for group profiling", journal = j-TKDD, volume = "1", number = "4", pages = "1:1--1:??", month = jan, year = "2008", CODEN = "????", DOI = "http://doi.acm.org/10.1145/1324172.1324173", ISSN = "1556-4681 (print), 1556-472X (electronic)", bibdate = "Fri Apr 24 17:59:07 MDT 2009", bibsource = "http://www.acm.org/pubs/contents/journals/tkdd/; http://www.math.utah.edu/pub/tex/bib/tkdd.bib", abstract = "A topic taxonomy is an effective representation that describes salient features of virtual groups or online communities. A topic taxonomy consists of topic nodes. 
Each internal node is defined by its vertical path (i.e., ancestor and child nodes) and its horizontal list of attributes (or terms). In a text-dominant environment, a topic taxonomy can be used to flexibly describe a group's interests with varying granularity. However, the stagnant nature of a taxonomy may fail to timely capture the dynamic change of a group's interest. This article addresses the problem of how to adapt a topic taxonomy to the accumulated data that reflects the change of a group's interest to achieve dynamic group profiling. We first discuss the issues related to topic taxonomy. We next formulate taxonomy adaptation as an optimization problem to find the taxonomy that best fits the data. We then present a viable algorithm that can efficiently accomplish taxonomy adaptation. We conduct extensive experiments to evaluate our approach's efficacy for group profiling, compare the approach with some alternatives, and study its performance for dynamic group profiling. While pointing out various applications of taxonomy adaption, we suggest some future work that can take advantage of burgeoning Web 2.0 services for online targeted marketing, counterterrorism in connecting dots, and community tracking.", acknowledgement = ack-nhfb, articleno = "1", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J1054", keywords = "dynamic profiling; group interest; taxonomy adjustment; text hierarchical classification; Topic taxonomy", } @Article{Cormode:2008:FHH, author = "Graham Cormode and Flip Korn and S. 
Muthukrishnan and Divesh Srivastava", title = "Finding hierarchical heavy hitters in streaming data", journal = j-TKDD, volume = "1", number = "4", pages = "2:1--2:??", month = jan, year = "2008", CODEN = "????", DOI = "http://doi.acm.org/10.1145/1324172.1324174", ISSN = "1556-4681 (print), 1556-472X (electronic)", bibdate = "Fri Apr 24 17:59:07 MDT 2009", bibsource = "http://www.acm.org/pubs/contents/journals/tkdd/; http://www.math.utah.edu/pub/tex/bib/tkdd.bib", abstract = "Data items that arrive online as streams typically have attributes which take values from one or more hierarchies (time and geographic location, source and destination IP addresses, etc.). Providing an aggregate view of such data is important for summarization, visualization, and analysis. We develop an aggregate view based on certain organized sets of large-valued regions (``heavy hitters'') corresponding to hierarchically discounted frequency counts. We formally define the notion of {\em hierarchical heavy hitters\/} (HHHs). We first consider computing (approximate) HHHs over a data stream drawn from a single hierarchical attribute. We formalize the problem and give deterministic algorithms to find them in a single pass over the input.\par In order to analyze a wider range of realistic data streams (e.g., from IP traffic-monitoring applications), we generalize this problem to multiple dimensions. Here, the semantics of HHHs are more complex, since a ``child'' node can have multiple ``parent'' nodes. We present online algorithms that find approximate HHHs in one pass, with provable accuracy guarantees. The product of hierarchical dimensions forms a mathematical lattice structure. 
Our algorithms exploit this structure, and so are able to track approximate HHHs using only a small, fixed number of statistics per stored item, regardless of the number of dimensions.\par We show experimentally, using real data, that our proposed algorithms yield outputs which are very similar (virtually identical, in many cases) to offline computations of the exact solutions, whereas straightforward heavy-hitters-based approaches give significantly inferior answer quality. Furthermore, the proposed algorithms result in an order of magnitude savings in data structure size while performing competitively.", acknowledgement = ack-nhfb, articleno = "2", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J1054", keywords = "approximation algorithms; Data mining; network data analysis", } @Article{Somaiya:2008:LCU, author = "Manas Somaiya and Christopher Jermaine and Sanjay Ranka", title = "Learning correlations using the mixture-of-subsets model", journal = j-TKDD, volume = "1", number = "4", pages = "3:1--3:??", month = jan, year = "2008", CODEN = "????", DOI = "http://doi.acm.org/10.1145/1324172.1324175", ISSN = "1556-4681 (print), 1556-472X (electronic)", bibdate = "Fri Apr 24 17:59:07 MDT 2009", bibsource = "http://www.acm.org/pubs/contents/journals/tkdd/; http://www.math.utah.edu/pub/tex/bib/tkdd.bib", abstract = "Using a mixture of random variables to model data is a tried-and-tested method common in data mining, machine learning, and statistics. By using mixture modeling it is often possible to accurately model even complex, multimodal data via very simple components. However, the classical mixture model assumes that a data point is generated by a single component in the model. A lot of datasets can be modeled closer to the underlying reality if we drop this restriction. We propose a probabilistic framework, the {\em mixture-of-subsets (MOS) model}, by making two fundamental changes to the classical mixture model.
First, we allow a data point to be generated by a set of components, rather than just a single component. Next, we limit the number of data attributes that each component can influence. We also propose an EM framework to learn the MOS model from a dataset, and experimentally evaluate it on real, high-dimensional datasets. Our results show that the MOS model learned from the data represents the underlying nature of the data accurately.", acknowledgement = ack-nhfb, articleno = "3", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J1054", keywords = "EM algorithm; high-dimensional data; Mixture modeling", } @Article{Halkidi:2008:CFB, author = "M. Halkidi and D. Gunopulos and M. Vazirgiannis and N. Kumar and C. Domeniconi", title = "A clustering framework based on subjective and objective validity criteria", journal = j-TKDD, volume = "1", number = "4", pages = "4:1--4:??", month = jan, year = "2008", CODEN = "????", DOI = "http://doi.acm.org/10.1145/1324172.1324176", ISSN = "1556-4681 (print), 1556-472X (electronic)", bibdate = "Fri Apr 24 17:59:07 MDT 2009", bibsource = "http://www.acm.org/pubs/contents/journals/tkdd/; http://www.math.utah.edu/pub/tex/bib/tkdd.bib", abstract = "Clustering, as an unsupervised learning process is a challenging problem, especially in cases of high-dimensional datasets. Clustering result quality can benefit from user constraints and objective validity assessment. In this article, we propose a semisupervised framework for learning the weighted Euclidean subspace, where the best clustering can be achieved. Our approach capitalizes on: (i) user constraints; and (ii) the quality of intermediate clustering results in terms of their structural properties. The proposed framework uses the clustering algorithm and the validity measure as its parameters. We develop and discuss algorithms for learning and tuning the weights of contributing dimensions and defining the ``best'' clustering obtained by satisfying user constraints. 
Experimental results on benchmark datasets demonstrate the superiority of the proposed approach in terms of improved clustering accuracy.", acknowledgement = ack-nhfb, articleno = "4", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J1054", keywords = "cluster validity; data mining; Semisupervised learning; similarity measure learning; space learning", } @Article{Zaki:2008:ISI, author = "Mohammed J. Zaki and George Karypis and Jiong Yang and Wei Wang", title = "Introduction to special issue on bioinformatics", journal = j-TKDD, volume = "2", number = "1", pages = "1:1--1:??", month = mar, year = "2008", CODEN = "????", DOI = "http://doi.acm.org/10.1145/1342320.1342321", ISSN = "1556-4681 (print), 1556-472X (electronic)", bibdate = "Fri Apr 24 17:59:18 MDT 2009", bibsource = "http://www.acm.org/pubs/contents/journals/tkdd/; http://www.math.utah.edu/pub/tex/bib/tkdd.bib", acknowledgement = ack-nhfb, articleno = "1", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J1054", } @Article{Jin:2008:CMM, author = "Ying Jin and T. M. Murali and Naren Ramakrishnan", title = "Compositional mining of multirelational biological datasets", journal = j-TKDD, volume = "2", number = "1", pages = "2:1--2:??", month = mar, year = "2008", CODEN = "????", DOI = "http://doi.acm.org/10.1145/1342320.1342322", ISSN = "1556-4681 (print), 1556-472X (electronic)", bibdate = "Fri Apr 24 17:59:18 MDT 2009", bibsource = "http://www.acm.org/pubs/contents/journals/tkdd/; http://www.math.utah.edu/pub/tex/bib/tkdd.bib", abstract = "High-throughput biological screens are yielding ever-growing streams of information about multiple aspects of cellular activity. As more and more categories of datasets come online, there is a corresponding multitude of ways in which inferences can be chained across them, motivating the need for compositional data mining algorithms. 
In this article, we argue that such compositional data mining can be effectively realized by functionally cascading redescription mining and biclustering algorithms as primitives. Both these primitives mirror shifts of vocabulary that can be composed in arbitrary ways to create rich chains of inferences. Given a relational database and its schema, we show how the schema can be automatically compiled into a compositional data mining program, and how different domains in the schema can be related through logical sequences of biclustering and redescription invocations. This feature allows us to rapidly prototype new data mining applications, yielding greater understanding of scientific datasets. We describe two applications of compositional data mining: (i) matching terms across categories of the Gene Ontology and (ii) understanding the molecular mechanisms underlying stress response in human cells.", acknowledgement = ack-nhfb, articleno = "2", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J1054", keywords = "Biclustering; bioinformatics; compositional data mining; inductive logic programming; redescription mining", } @Article{Sahay:2008:DSB, author = "Saurav Sahay and Sougata Mukherjea and Eugene Agichtein and Ernest V. Garcia and Shamkant B. Navathe and Ashwin Ram", title = "Discovering semantic biomedical relations utilizing the {Web}", journal = j-TKDD, volume = "2", number = "1", pages = "3:1--3:??", month = mar, year = "2008", CODEN = "????", DOI = "http://doi.acm.org/10.1145/1342320.1342323", ISSN = "1556-4681 (print), 1556-472X (electronic)", bibdate = "Fri Apr 24 17:59:18 MDT 2009", bibsource = "http://www.acm.org/pubs/contents/journals/tkdd/; http://www.math.utah.edu/pub/tex/bib/tkdd.bib", abstract = "To realize the vision of a Semantic Web for Life Sciences, discovering relations between resources is essential. It is very difficult to automatically extract relations from Web pages expressed in natural language formats. 
On the other hand, because of the explosive growth of information, it is difficult to manually extract the relations. In this paper we present techniques to automatically discover relations between biomedical resources from the Web. For this purpose we retrieve relevant information from Web Search engines and Pubmed database using various lexico-syntactic patterns as queries over SOAP web services. The patterns are initially handcrafted but can be progressively learnt. The extracted relations can be used to construct and augment ontologies and knowledge bases. Experiments are presented for general biomedical relation discovery and domain specific search to show the usefulness of our technique.", acknowledgement = ack-nhfb, articleno = "3", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J1054", keywords = "Ontology construction; relation identification", } @Article{Ye:2008:DSA, author = "Jieping Ye and Jianhui Chen and Ravi Janardan and Sudhir Kumar", title = "Developmental stage annotation of {Drosophila} gene expression pattern images via an entire solution path for {LDA}", journal = j-TKDD, volume = "2", number = "1", pages = "4:1--4:??", month = mar, year = "2008", CODEN = "????", DOI = "http://doi.acm.org/10.1145/1342320.1342324", ISSN = "1556-4681 (print), 1556-472X (electronic)", bibdate = "Fri Apr 24 17:59:18 MDT 2009", bibsource = "http://www.acm.org/pubs/contents/journals/tkdd/; http://www.math.utah.edu/pub/tex/bib/string-matching.bib; http://www.math.utah.edu/pub/tex/bib/tkdd.bib", abstract = "Gene expression in a developing embryo occurs in particular cells (spatial patterns) in a time-specific manner (temporal patterns), which leads to the differentiation of cell fates. Images of a {\em Drosophila melanogaster\/} embryo at a given developmental stage, showing a particular gene expression pattern revealed by a gene-specific probe, can be compared for spatial overlaps. 
The comparison is fundamentally important to formulating and testing gene interaction hypotheses. Expression pattern comparison is most biologically meaningful when images from a similar time point (developmental stage) are compared. In this paper, we present LdaPath, a novel formulation of Linear Discriminant Analysis (LDA) for automatic developmental stage range classification. It employs multivariate linear regression with the {$ L_1 $}-norm penalty controlled by a regularization parameter for feature extraction and visualization. LdaPath computes an entire solution path for all values of regularization parameter with essentially the same computational cost as fitting one LDA model. Thus, it facilitates efficient model selection. It is based on the equivalence relationship between LDA and the least squares method for multiclass classifications. This equivalence relationship is established under a mild condition, which we show empirically to hold for many high-dimensional datasets, such as expression pattern images. Our experiments on a collection of 2705 expression pattern images show the effectiveness of the proposed algorithm. Results also show that the LDA model resulting from LdaPath is sparse, and irrelevant features may be removed. 
Thus, LdaPath provides a general framework for simultaneous feature selection and feature extraction.", acknowledgement = ack-nhfb, articleno = "4", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J1054", keywords = "dimensionality reduction; Gene expression pattern image; linear discriminant analysis; linear regression", } @Article{Lu:2008:ADA, author = "Yijuan Lu and Qi Tian and Jennifer Neary and Feng Liu and Yufeng Wang", title = "Adaptive discriminant analysis for microarray-based classification", journal = j-TKDD, volume = "2", number = "1", pages = "5:1--5:??", month = mar, year = "2008", CODEN = "????", DOI = "http://doi.acm.org/10.1145/1342320.1342325", ISSN = "1556-4681 (print), 1556-472X (electronic)", bibdate = "Fri Apr 24 17:59:18 MDT 2009", bibsource = "http://www.acm.org/pubs/contents/journals/tkdd/; http://www.math.utah.edu/pub/tex/bib/tkdd.bib", abstract = "Microarray technology has generated enormous amounts of high-dimensional gene expression data, providing a unique platform for exploring gene regulatory networks. However, the curse of dimensionality plagues effort to analyze these high throughput data. Linear Discriminant Analysis (LDA) and Biased Discriminant Analysis (BDA) are two popular techniques for dimension reduction, which pay attention to different roles of the positive and negative samples in finding discriminating subspace. However, the drawbacks of these two methods are obvious: LDA has limited efficiency in classifying sample data from subclasses with different distributions, and BDA does not account for the underlying distribution of negative samples.\par In this paper, we propose a novel dimension reduction technique for microarray analysis: Adaptive Discriminant Analysis (ADA), which effectively exploits favorable attributes of both BDA and LDA and avoids their unfavorable ones. ADA can find a good discriminative subspace with adaptation to different sample distributions. 
It not only alleviates the problem of high dimensionality, but also enhances the classification performance in the subspace with na{\"\i}ve Bayes classifier. To learn the best model fitting the real scenario, boosted Adaptive Discriminant Analysis is further proposed. Extensive experiments on the yeast cell cycle regulation data set, and the expression data of the red blood cell cycle in malaria parasite {\em Plasmodium falciparum\/} demonstrate the superior performance of ADA and boosted ADA. We also present some putative genes of specific functional classes predicted by boosted ADA. Their potential functionality is confirmed by independent predictions based on Gene Ontology, demonstrating that ADA and boosted ADA are effective dimension reduction methods for microarray-based classification.", acknowledgement = ack-nhfb, articleno = "5", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J1054", keywords = "ADA; BDA; boosted ADA; dimension reduction; LDA; microarray", } @Article{Hashimoto:2008:NEP, author = "Kosuke Hashimoto and Kiyoko Flora Aoki-Kinoshita and Nobuhisa Ueda and Minoru Kanehisa and Hiroshi Mamitsuka", title = "A new efficient probabilistic model for mining labeled ordered trees applied to glycobiology", journal = j-TKDD, volume = "2", number = "1", pages = "6:1--6:??", month = mar, year = "2008", CODEN = "????", DOI = "http://doi.acm.org/10.1145/1342320.1342326", ISSN = "1556-4681 (print), 1556-472X (electronic)", bibdate = "Fri Apr 24 17:59:18 MDT 2009", bibsource = "http://www.acm.org/pubs/contents/journals/tkdd/; http://www.math.utah.edu/pub/tex/bib/tkdd.bib", abstract = "Mining frequent patterns from large datasets is an important issue in data mining. Recently, complex and unstructured (or semi-structured) datasets have appeared as targets for major data mining applications, including text mining, web mining and bioinformatics. Our work focuses on labeled ordered trees, which are typically semi-structured datasets. 
In bioinformatics, carbohydrate sugar chains, or glycans, can be modeled as labeled ordered trees. Glycans are the third major class of biomolecules, having important roles in signaling and recognition. For mining labeled ordered trees, we propose a new probabilistic model and its efficient learning scheme which significantly improves the time and space complexity of an existing probabilistic model for labeled ordered trees. We evaluated the performance of the proposed model, comparing it with those of other probabilistic models, using synthetic as well as real datasets from glycobiology. Experimental results showed that the proposed model drastically reduced the computation time of the competing model, keeping the predictive power and avoiding overfitting to the training data. Finally, we assessed our results on real data from a variety of biological viewpoints, verifying known facts in glycobiology.", acknowledgement = ack-nhfb, articleno = "6", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J1054", keywords = "Expectation-maximization; labeled ordered trees; maximum likelihood; probabilistic models", } @Article{Ge:2008:JCA, author = "Rong Ge and Martin Ester and Byron J. Gao and Zengjian Hu and Binay Bhattacharya and Boaz Ben-Moshe", title = "Joint cluster analysis of attribute data and relationship data: {The} connected $k$-center problem, algorithms and applications", journal = j-TKDD, volume = "2", number = "2", pages = "7:1--7:??", month = jul, year = "2008", CODEN = "????", DOI = "http://doi.acm.org/10.1145/1376815.1376816", ISSN = "1556-4681 (print), 1556-472X (electronic)", bibdate = "Fri Apr 24 17:59:30 MDT 2009", bibsource = "http://www.acm.org/pubs/contents/journals/tkdd/; http://www.math.utah.edu/pub/tex/bib/tkdd.bib", abstract = "Attribute data and relationship data are two principal types of data, representing the intrinsic and extrinsic properties of entities. 
While attribute data have been the main source of data for cluster analysis, relationship data such as social networks or metabolic networks are becoming increasingly available. It is also common to observe both data types carry complementary information such as in market segmentation and community identification, which calls for a joint cluster analysis of both data types so as to achieve better results. In this article, we introduce the novel Connected $k$-Center ({\em CkC\/}) problem, a clustering model taking into account attribute data as well as relationship data. We analyze the complexity of the problem and prove its NP-hardness. Therefore, we analyze the approximability of the problem and also present a constant factor approximation algorithm. For the special case of the {\em CkC\/} problem where the relationship data form a tree structure, we propose a dynamic programming method giving an optimal solution in polynomial time. We further present NetScan, a heuristic algorithm that is efficient and effective for large real databases. 
Our extensive experimental evaluation on real datasets demonstrates the meaningfulness and accuracy of the NetScan results.", acknowledgement = ack-nhfb, articleno = "7", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J1054", keywords = "approximation algorithms; Attribute data; community identification; document clustering; joint cluster analysis; market segmentation; NP-hardness; relationship data", } @Article{Gupta:2008:BBC, author = "Gunjan Gupta and Joydeep Ghosh", title = "{Bregman} bubble clustering: a robust framework for mining dense clusters", journal = j-TKDD, volume = "2", number = "2", pages = "8:1--8:??", month = jul, year = "2008", CODEN = "????", DOI = "http://doi.acm.org/10.1145/1376815.1376817", ISSN = "1556-4681 (print), 1556-472X (electronic)", bibdate = "Fri Apr 24 17:59:30 MDT 2009", bibsource = "http://www.acm.org/pubs/contents/journals/tkdd/; http://www.math.utah.edu/pub/tex/bib/tkdd.bib", abstract = "In classical clustering, each data point is assigned to at least one cluster. However, in many applications only a small subset of the available data is relevant for the problem and the rest needs to be ignored in order to obtain good clusters. Certain nonparametric density-based clustering methods find the most relevant data as multiple dense regions, but such methods are generally limited to low-dimensional data and do not scale well to large, high-dimensional datasets. Also, they use a specific notion of ``distance'', typically Euclidean or Mahalanobis distance, which further limits their applicability. On the other hand, the recent One Class Information Bottleneck (OC-IB) method is fast and works on a large class of distortion measures known as Bregman Divergences, but can only find a {\em single\/} dense region. This article presents a broad framework for finding $k$ dense clusters while ignoring the rest of the data. It includes a seeding algorithm that can automatically determine a suitable value for {\em k}. 
When $k$ is forced to 1, our method gives rise to an improved version of OC-IB with optimality guarantees. We provide a generative model that yields the proposed iterative algorithm for finding $k$ dense regions as a special case. Our analysis reveals an interesting and novel connection between the problem of finding dense regions and exponential mixture models; a hard model corresponding to $k$ exponential mixtures with a uniform background results in a set of $k$ dense clusters. The proposed method describes a highly scalable algorithm for finding multiple dense regions that works with any Bregman Divergence, thus extending density based clustering to a variety of non-Euclidean problems not addressable by earlier methods. We present empirical results on three artificial, two microarray and one text dataset to show the relevance and effectiveness of our methods.", acknowledgement = ack-nhfb, articleno = "8", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J1054", keywords = "Bregman divergences; Density-based clustering; expectation maximization; exponential family; One Class classification", } @Article{Tan:2008:TMG, author = "Henry Tan and Fedja Hadzic and Tharam S. Dillon and Elizabeth Chang and Ling Feng", title = "Tree model guided candidate generation for mining frequent subtrees from {XML} documents", journal = j-TKDD, volume = "2", number = "2", pages = "9:1--9:??", month = jul, year = "2008", CODEN = "????", DOI = "http://doi.acm.org/10.1145/1376815.1376818", ISSN = "1556-4681 (print), 1556-472X (electronic)", bibdate = "Fri Apr 24 17:59:30 MDT 2009", bibsource = "http://www.acm.org/pubs/contents/journals/tkdd/; http://www.math.utah.edu/pub/tex/bib/tkdd.bib", abstract = "Due to the inherent flexibilities in both structure and semantics, XML association rules mining faces few challenges, such as: a more complicated hierarchical data structure and ordered data context. 
Mining frequent patterns from XML documents can be recast as mining frequent tree structures from a database of XML documents. In this study, we model a database of XML documents as a database of rooted labeled ordered subtrees. In particular, we are mainly concerned with mining frequent induced and embedded ordered subtrees. Our main contributions are as follows. We describe our unique {\em embedding list\/} representation of the tree structure, which enables efficient implementation of our {\em Tree Model Guided\/} ({\em TMG\/}) candidate generation. {\em TMG\/} is an optimal, nonredundant enumeration strategy that enumerates all the valid candidates that conform to the structural aspects of the data. We show through a mathematical model and experiments that {\em TMG\/} has better complexity compared to the commonly used join approach. In this article, we propose two algorithms, MB3-Miner and iMB3-Miner. MB3-Miner mines embedded subtrees. iMB3-Miner mines induced and/or embedded subtrees by using the {\em maximum level of embedding constraint}. 
Our experiments with both synthetic and real datasets against two well-known algorithms for mining induced and embedded subtrees, demonstrate the effectiveness and the efficiency of the proposed techniques.", acknowledgement = ack-nhfb, articleno = "9", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J1054", keywords = "FREQT; TMG; Tree mining; tree model guided; TreeMiner", } @Article{Islam:2008:STS, author = "Aminul Islam and Diana Inkpen", title = "Semantic text similarity using corpus-based word similarity and string similarity", journal = j-TKDD, volume = "2", number = "2", pages = "10:1--10:??", month = jul, year = "2008", CODEN = "????", DOI = "http://doi.acm.org/10.1145/1376815.1376819", ISSN = "1556-4681 (print), 1556-472X (electronic)", bibdate = "Fri Apr 24 17:59:30 MDT 2009", bibsource = "http://www.acm.org/pubs/contents/journals/tkdd/; http://www.math.utah.edu/pub/tex/bib/tkdd.bib", abstract = "We present a method for measuring the semantic similarity of texts using a corpus-based measure of semantic word similarity and a normalized and modified version of the Longest Common Subsequence (LCS) string matching algorithm. Existing methods for computing text similarity have focused mainly on either large documents or individual words. We focus on computing the similarity between two sentences or two short paragraphs. The proposed method can be exploited in a variety of applications involving textual knowledge representation and knowledge discovery. Evaluation results on two different data sets show that our method outperforms several competing methods.", acknowledgement = ack-nhfb, articleno = "10", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J1054", keywords = "corpus-based measures; Semantic similarity of words; similarity of short texts", } @Article{Sun:2008:ITA, author = "Jimeng Sun and Dacheng Tao and Spiros Papadimitriou and Philip S. 
Yu and Christos Faloutsos", title = "Incremental tensor analysis: {Theory} and applications", journal = j-TKDD, volume = "2", number = "3", pages = "11:1--11:??", month = oct, year = "2008", CODEN = "????", DOI = "http://doi.acm.org/10.1145/1409620.1409621", ISSN = "1556-4681 (print), 1556-472X (electronic)", bibdate = "Fri Apr 24 17:59:41 MDT 2009", bibsource = "http://www.acm.org/pubs/contents/journals/tkdd/; http://www.math.utah.edu/pub/tex/bib/tkdd.bib", abstract = "How do we find patterns in author-keyword associations, evolving over time? Or in data cubes (tensors), with product-branch-customer sales information? And more generally, how to summarize high-order data cubes (tensors)? How to incrementally update these patterns over time? Matrix decompositions, like principal component analysis (PCA) and variants, are invaluable tools for mining, dimensionality reduction, feature selection, rule identification in numerous settings like streaming data, text, graphs, social networks, and many more settings. However, they have only two orders (i.e., matrices, like author and keyword in the previous example).\par We propose to envision such higher-order data as tensors, and tap the vast literature on the topic. However, these methods do not necessarily scale up, let alone operate on semi-infinite streams. Thus, we introduce a general framework, incremental tensor analysis (ITA), which efficiently computes a compact summary for high-order and high-dimensional data, and also reveals the hidden correlations. Three variants of ITA are presented: (1) dynamic tensor analysis (DTA); (2) streaming tensor analysis (STA); and (3) window-based tensor analysis (WTA).
In particular, we explore several fundamental design trade-offs such as space efficiency, computational cost, approximation accuracy, time dependency, and model complexity.\par We implement all our methods and apply them in several real settings, such as network anomaly detection, multiway latent semantic indexing on citation networks, and correlation study on sensor measurements. Our empirical studies show that the proposed methods are fast and accurate and that they find interesting patterns and outliers on the real datasets.", acknowledgement = ack-nhfb, articleno = "11", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J1054", keywords = "multilinear algebra; stream mining; Tensor", } @Article{Mangasarian:2008:PPC, author = "Olvi L. Mangasarian and Edward W. Wild and Glenn M. Fung", title = "Privacy-preserving classification of vertically partitioned data via random kernels", journal = j-TKDD, volume = "2", number = "3", pages = "12:1--12:??", month = oct, year = "2008", CODEN = "????", DOI = "http://doi.acm.org/10.1145/1409620.1409622", ISSN = "1556-4681 (print), 1556-472X (electronic)", bibdate = "Fri Apr 24 17:59:41 MDT 2009", bibsource = "http://www.acm.org/pubs/contents/journals/tkdd/; http://www.math.utah.edu/pub/tex/bib/tkdd.bib", abstract = "We propose a novel privacy-preserving support vector machine (SVM) classifier for a data matrix $A$ whose input feature columns are divided into groups belonging to different entities. Each entity is unwilling to share its group of columns or make it public. Our classifier is based on the concept of a reduced kernel $k(A, B')$, where $B'$ is the transpose of a random matrix $B$. The column blocks of $B$ corresponding to the different entities are privately generated by each entity and never made public.
The proposed linear or nonlinear SVM classifier, which is public but does not reveal any of the privately held data, has accuracy comparable to that of an ordinary SVM classifier that uses the entire set of input features directly.", acknowledgement = ack-nhfb, articleno = "12", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J1054", keywords = "Privacy preserving classification; support vector machines; vertically partitioned data", } @Article{Lakshmanan:2008:DRA, author = "Laks V. S. Lakshmanan and Raymond T. Ng and Ganesh Ramesh", title = "On disclosure risk analysis of anonymized itemsets in the presence of prior knowledge", journal = j-TKDD, volume = "2", number = "3", pages = "13:1--13:??", month = oct, year = "2008", CODEN = "????", DOI = "http://doi.acm.org/10.1145/1409620.1409623", ISSN = "1556-4681 (print), 1556-472X (electronic)", bibdate = "Fri Apr 24 17:59:41 MDT 2009", bibsource = "http://www.acm.org/pubs/contents/journals/tkdd/; http://www.math.utah.edu/pub/tex/bib/tkdd.bib", abstract = "Decision makers of companies often face the dilemma of whether to release data for knowledge discovery, vis-a-vis the risk of disclosing proprietary or sensitive information. Among the various methods employed for ``sanitizing'' the data prior to disclosure, we focus in this article on anonymization, given its widespread use in practice. We do due diligence to the question ``just how safe is the anonymized data?'' We consider both those scenarios when the hacker has no information and, more realistically, when the hacker may have partial information about items in the domain. We conduct our analyses in the context of frequent set mining and address the safety question at two different levels: (i) how likely of being cracked (i.e., re-identified by a hacker), are the identities of individual items and (ii) how likely are sets of items cracked? 
For capturing the prior knowledge of the hacker, we propose a {\em belief function}, which amounts to an educated guess of the frequency of each item. For various classes of belief functions which correspond to different degrees of prior knowledge, we derive formulas for computing the expected number of cracks of single items and for itemsets, the probability of cracking the itemsets. While obtaining exact values for more general situations is computationally hard, we propose a series of heuristics called the {\em O-estimates}. They are easy to compute and are shown fairly accurate, justified by empirical results on real benchmark datasets. Based on the O-estimates, we propose a recipe for the decision makers to resolve their dilemma. Our recipe operates at two different levels, depending on whether the data owner wants to reason in terms of single items or sets of items (or both). Finally, we present techniques for ascertaining a hacker's knowledge of correlation in terms of co-occurrence of items likely. This information regarding the hacker's knowledge can be incorporated into our framework of disclosure risk analysis and we present experimental results demonstrating how this knowledge affects the heuristic estimates we have developed.", acknowledgement = ack-nhfb, articleno = "13", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J1054", keywords = "anonymization; belief function; bipartite graphs; correlation; Disclosure risk; frequent itemsets; hacker; matching; prior knowledge; sampling", } @Article{Vaidya:2008:PPD, author = "Jaideep Vaidya and Chris Clifton and Murat Kantarcioglu and A.
Scott Patterson", title = "Privacy-preserving decision trees over vertically partitioned data", journal = j-TKDD, volume = "2", number = "3", pages = "14:1--14:??", month = oct, year = "2008", CODEN = "????", DOI = "http://doi.acm.org/10.1145/1409620.1409624", ISSN = "1556-4681 (print), 1556-472X (electronic)", bibdate = "Fri Apr 24 17:59:41 MDT 2009", bibsource = "http://www.acm.org/pubs/contents/journals/tkdd/; http://www.math.utah.edu/pub/tex/bib/tkdd.bib", abstract = "Privacy and security concerns can prevent sharing of data, derailing data-mining projects. Distributed knowledge discovery, if done correctly, can alleviate this problem. We introduce a generalized privacy-preserving variant of the ID3 algorithm for vertically partitioned data distributed over two or more parties. Along with a proof of security, we discuss what would be necessary to make the protocols completely secure. We also provide experimental results, giving a first demonstration of the practical complexity of secure multiparty computation-based data mining.", acknowledgement = ack-nhfb, articleno = "14", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J1054", keywords = "Decision tree classification; privacy", } @Article{Chuang:2009:FPS, author = "Kun-Ta Chuang and Hung-Leng Chen and Ming-Syan Chen", title = "Feature-preserved sampling over streaming data", journal = j-TKDD, volume = "2", number = "4", pages = "15:1--15:??", month = jan, year = "2009", CODEN = "????", DOI = "http://doi.acm.org/10.1145/1460797.1460798", ISSN = "1556-4681 (print), 1556-472X (electronic)", bibdate = "Fri Apr 24 17:59:51 MDT 2009", bibsource = "http://www.acm.org/pubs/contents/journals/tkdd/; http://www.math.utah.edu/pub/tex/bib/tkdd.bib", abstract = "In this article, we explore a novel sampling model, called {\em feature preserved sampling\/} ({\em FPS\/}) that sequentially generates a high-quality sample over sliding windows. 
The sampling quality we consider refers to the degree of consistency between the sample proportion and the population proportion of each attribute value in a window. Due to the time-variant nature of real-world datasets, users are more likely to be interested in the most recent data. However, previous works have not been able to generate a high-quality sample over sliding windows that precisely preserves up-to-date population characteristics. Motivated by this shortcoming, we have developed the {\em FPS\/} algorithm, which has several advantages: (1) it sequentially generates a sample from a time-variant data source over sliding windows; (2) the execution time of {\em FPS\/} is linear with respect to the database size; (3) the {\em relative\/} proportional differences between the sample proportions and population proportions of most distinct attribute values are guaranteed to be below a specified error threshold, $\epsilon$, while the {\em relative\/} proportion differences of the remaining attribute values are as close to $\epsilon$ as possible, which ensures that the generated sample is of high quality; (4) the sample rate is close to the user specified rate so that a high quality sampling result can be obtained without increasing the sample size; (5) by a thorough analytical and empirical study, we prove that {\em FPS\/} has acceptable space overheads, especially when the attribute values have Zipfian distributions, and {\em FPS\/} can also excellently preserve the population proportion of multivariate features in the sample; and (6) {\em FPS\/} can be applied to infinite streams and finite datasets equally, and the generated samples can be used for various applications. Our experiments on both real and synthetic data validate that {\em FPS\/} can effectively obtain a high quality sample of the desired size.
In addition, while using the sample generated by {\em FPS\/} in various mining applications, a significant improvement in efficiency can be achieved without compromising the model's precision.", acknowledgement = ack-nhfb, articleno = "15", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J1054", keywords = "sampling; Streaming mining", } @Article{Jiang:2009:MFC, author = "Daxin Jiang and Jian Pei", title = "Mining frequent cross-graph quasi-cliques", journal = j-TKDD, volume = "2", number = "4", pages = "16:1--16:??", month = jan, year = "2009", CODEN = "????", DOI = "http://doi.acm.org/10.1145/1460797.1460799", ISSN = "1556-4681 (print), 1556-472X (electronic)", bibdate = "Fri Apr 24 17:59:51 MDT 2009", bibsource = "http://www.acm.org/pubs/contents/journals/tkdd/; http://www.math.utah.edu/pub/tex/bib/tkdd.bib", abstract = "Joint mining of multiple datasets can often discover interesting, novel, and reliable patterns which cannot be obtained solely from any single source. For example, in bioinformatics, jointly mining multiple gene expression datasets obtained by different labs or during various biological processes may overcome the heavy noise in the data. Moreover, by joint mining of gene expression data and protein-protein interaction data, we may discover clusters of genes which show coherent expression patterns and also produce interacting proteins. Such clusters may be potential pathways.\par In this article, we investigate a novel data mining problem, {\em mining frequent cross-graph quasi-cliques}, which is generalized from several interesting applications in bioinformatics, cross-market customer segmentation, social network analysis, and Web mining. In a graph, a set of vertices $S$ is a $\gamma$-quasi-clique $(0 < \gamma \leq 1)$ if each vertex $v$ in $S$ directly connects to at least $\gamma \cdot (|S| - 1)$ other vertices in $S$. 
Given a set of graphs $G_1, \ldots{}, G_n$ and parameter ${\rm min\_sup} (0 < {\rm min\_sup} \leq 1)$, a set of vertices $S$ is a frequent cross-graph quasi-clique if $S$ is a $\gamma$-quasi-clique in at least ${\rm min\_sup} \cdot n$ graphs, and there does not exist a proper superset of $S$ having the property.\par We build a general model, show why the complete set of frequent cross-graph quasi-cliques cannot be found by previous data mining methods, and study the complexity of the problem. While the problem is difficult, we develop practical algorithms which exploit several interesting and effective techniques and heuristics to efficaciously mine frequent cross-graph quasi-cliques. A systematic performance study is reported on both synthetic and real data sets. We demonstrate some interesting and meaningful frequent cross-graph quasi-cliques in bioinformatics. The experimental results also show that our algorithms are efficient and scalable.", acknowledgement = ack-nhfb, articleno = "16", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J1054", keywords = "bioinformatics; clique; Graph mining; joint mining", } @Article{Domeniconi:2009:WCE, author = "Carlotta Domeniconi and Muna Al-Razgan", title = "Weighted cluster ensembles: {Methods} and analysis", journal = j-TKDD, volume = "2", number = "4", pages = "17:1--17:??", month = jan, year = "2009", CODEN = "????", DOI = "http://doi.acm.org/10.1145/1460797.1460800", ISSN = "1556-4681 (print), 1556-472X (electronic)", bibdate = "Fri Apr 24 17:59:51 MDT 2009", bibsource = "http://www.acm.org/pubs/contents/journals/tkdd/; http://www.math.utah.edu/pub/tex/bib/tkdd.bib", abstract = "Cluster ensembles offer a solution to challenges inherent to clustering arising from its ill-posed nature.
Cluster ensembles can provide robust and stable solutions by leveraging the consensus across multiple clustering results, while averaging out emergent spurious structures that arise due to the various biases to which each participating algorithm is tuned. In this article, we address the problem of combining multiple {\em weighted clusters\/} that belong to different subspaces of the input space. We leverage the diversity of the input clusterings in order to generate a consensus partition that is superior to the participating ones. Since we are dealing with weighted clusters, our consensus functions make use of the weight vectors associated with the clusters. We demonstrate the effectiveness of our techniques by running experiments with several real datasets, including high-dimensional text data. Furthermore, we investigate in depth the issue of diversity and accuracy for our ensemble methods. Our analysis and experimental results show that the proposed techniques are capable of producing a partition that is as good as or better than the best individual clustering.", acknowledgement = ack-nhfb, articleno = "17", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J1054", keywords = "accuracy and diversity measures; Cluster ensembles; consensus functions; data mining; subspace clustering; text data", } @Article{Zhang:2009:DGA, author = "Zhenjie Zhang and Laks V. S. Lakshmanan and Anthony K. H. Tung", title = "On domination game analysis for microeconomic data mining", journal = j-TKDD, volume = "2", number = "4", pages = "18:1--18:??", month = jan, year = "2009", CODEN = "????", DOI = "http://doi.acm.org/10.1145/1460797.1460801", ISSN = "1556-4681 (print), 1556-472X (electronic)", bibdate = "Fri Apr 24 17:59:51 MDT 2009", bibsource = "http://www.acm.org/pubs/contents/journals/tkdd/; http://www.math.utah.edu/pub/tex/bib/tkdd.bib", abstract = "Game theory is a powerful tool for analyzing the competitions among manufacturers in a market. 
In this article, we present a study on combining game theory and data mining by introducing the concept of domination game analysis. We present a multidimensional market model, where every dimension represents one attribute of a commodity. Every product or customer is represented by a point in the multidimensional space, and a product is said to ``dominate'' a customer if all of its attributes can satisfy the requirements of the customer. The expected market share of a product is measured by the expected number of the buyers in the customers, all of which are equally likely to buy any product dominating him. A Nash equilibrium is a configuration of the products achieving stable expected market shares for all products. We prove that Nash equilibrium in such a model can be computed in polynomial time if every manufacturer tries to modify its product in a round robin manner. To further improve the efficiency of the computation, we also design two algorithms for the manufacturers to efficiently find their best response to other products in the market.", acknowledgement = ack-nhfb, articleno = "18", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J1054", keywords = "data mining; Domination game; game theory", } @Article{Kriegel:2009:CHD, author = "Hans-Peter Kriegel and Peer Kr{\"o}ger and Arthur Zimek", title = "Clustering high-dimensional data: {A} survey on subspace clustering, pattern-based clustering, and correlation clustering", journal = j-TKDD, volume = "3", number = "1", pages = "1:1--1:??", month = mar, year = "2009", CODEN = "????", DOI = "http://doi.acm.org/10.1145/1497577.1497578", ISSN = "1556-4681 (print), 1556-472X (electronic)", bibdate = "Fri Apr 24 18:00:01 MDT 2009", bibsource = "http://www.acm.org/pubs/contents/journals/tkdd/; http://www.math.utah.edu/pub/tex/bib/tkdd.bib", abstract = "As a prolific research area in data mining, subspace clustering and related problems induced a vast quantity of proposed solutions. 
However, many publications compare a new proposition --- if at all --- with one or two competitors, or even with a so-called ``na{\"\i}ve'' ad hoc solution, but fail to clarify the exact problem definition. As a consequence, even if two solutions are thoroughly compared experimentally, it will often remain unclear whether both solutions tackle the same problem or, if they do, whether they agree in certain tacit assumptions and how such assumptions may influence the outcome of an algorithm. In this survey, we try to clarify: (i) the different problem definitions related to subspace clustering in general; (ii) the specific difficulties encountered in this field of research; (iii) the varying assumptions, heuristics, and intuitions forming the basis of different approaches; and (iv) how several prominent solutions tackle different problems.", acknowledgement = ack-nhfb, articleno = "1", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J1054", keywords = "clustering; high-dimensional data; Survey", } @Article{Dhurandhar:2009:SAM, author = "Amit Dhurandhar and Alin Dobra", title = "Semi-analytical method for analyzing models and model selection measures based on moment analysis", journal = j-TKDD, volume = "3", number = "1", pages = "2:1--2:??", month = mar, year = "2009", CODEN = "????", DOI = "http://doi.acm.org/10.1145/1497577.1497579", ISSN = "1556-4681 (print), 1556-472X (electronic)", bibdate = "Fri Apr 24 18:00:01 MDT 2009", bibsource = "http://www.acm.org/pubs/contents/journals/tkdd/; http://www.math.utah.edu/pub/tex/bib/tkdd.bib", abstract = "In this article we propose a moment-based method for studying models and model selection measures. By focusing on the probabilistic space of classifiers induced by the classification algorithm rather than on that of datasets, we obtain efficient characterizations for computing the moments, which is followed by visualization of the resulting formulae that are too complicated for direct interpretation. 
By assuming the data to be drawn independently and identically distributed from the underlying probability distribution, and by going over the space of all possible datasets, we establish general relationships between the generalization error, hold-out-set error, cross-validation error, and leave-one-out error. We later exemplify the method and the results by studying the behavior of the errors for the naive Bayes classifier.", acknowledgement = ack-nhfb, articleno = "2", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J1054", keywords = "classification; generalization error; Model selection", } @Article{Cerf:2009:CPM, author = "Lo{\"\i}c Cerf and J{\'e}r{\'e}my Besson and C{\'e}line Robardet and Jean-Fran{\c{c}}ois Boulicaut", title = "Closed patterns meet $n$-ary relations", journal = j-TKDD, volume = "3", number = "1", pages = "3:1--3:??", month = mar, year = "2009", CODEN = "????", DOI = "http://doi.acm.org/10.1145/1497577.1497580", ISSN = "1556-4681 (print), 1556-472X (electronic)", bibdate = "Fri Apr 24 18:00:01 MDT 2009", bibsource = "http://www.acm.org/pubs/contents/journals/tkdd/; http://www.math.utah.edu/pub/tex/bib/tkdd.bib", abstract = "Set pattern discovery from binary relations has been extensively studied during the last decade. In particular, many complete and efficient algorithms for frequent closed set mining are now available. Generalizing such a task to $n$-ary relations ($n \geq 2$) appears as a timely challenge. It may be important for many applications, for example, when adding the time dimension to the popular {\em objects\/} $\times$ {\em features\/} binary case. The generality of the task (no assumption being made on the relation arity or on the size of its attribute domains) makes it computationally challenging. We introduce an algorithm called Data-Peeler. From an $n$-ary relation, it extracts all closed $n$-sets satisfying given piecewise (anti) monotonic constraints. 
This new class of constraints generalizes both monotonic and antimonotonic constraints. Considering the special case of ternary relations, Data-Peeler outperforms the state-of-the-art algorithms CubeMiner and Trias by orders of magnitude. These good performances must be granted to a new clever enumeration strategy allowing to efficiently enforce the closeness property. The relevance of the extracted closed $n$-sets is assessed on real-life 3- and 4-ary relations. Beyond natural 3- or 4-ary relations, expanding a relation with an additional attribute can help in enforcing rather abstract constraints such as the robustness with respect to binarization. Furthermore, a collection of closed $n$-sets is shown to be an excellent starting point to compute a tiling of the dataset.", acknowledgement = ack-nhfb, articleno = "3", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J1054", keywords = "$n$-ary relations; Closed patterns; constraint properties; constraint-based mining; tiling", } @Article{Angiulli:2009:DEA, author = "Fabrizio Angiulli and Fabio Fassetti", title = "{DOLPHIN}: {An} efficient algorithm for mining distance-based outliers in very large datasets", journal = j-TKDD, volume = "3", number = "1", pages = "4:1--4:??", month = mar, year = "2009", CODEN = "????", DOI = "http://doi.acm.org/10.1145/1497577.1497581", ISSN = "1556-4681 (print), 1556-472X (electronic)", bibdate = "Fri Apr 24 18:00:01 MDT 2009", bibsource = "http://www.acm.org/pubs/contents/journals/tkdd/; http://www.math.utah.edu/pub/tex/bib/tkdd.bib", abstract = "In this work a novel distance-based outlier detection algorithm, named DOLPHIN, working on disk-resident datasets and whose I/O cost corresponds to the cost of sequentially reading the input dataset file twice, is presented.\par It is both theoretically and empirically shown that the main memory usage of DOLPHIN amounts to a small fraction of the dataset and that DOLPHIN has linear time performance with respect to the dataset size.
DOLPHIN gains efficiency by naturally merging together in a unified schema three strategies, namely the selection policy of objects to be maintained in main memory, usage of pruning rules, and similarity search techniques. Importantly, similarity search is accomplished by the algorithm without the need of preliminarily indexing the whole dataset, as other methods do.\par The algorithm is simple to implement and it can be used with any type of data, belonging to either metric or nonmetric spaces. Moreover, a modification to the basic method allows DOLPHIN to deal with the scenario in which the available buffer of main memory is smaller than its standard requirements. DOLPHIN has been compared with state-of-the-art distance-based outlier detection algorithms, showing that it is much more efficient.", acknowledgement = ack-nhfb, articleno = "4", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J1054", keywords = "Data mining; distance-based outliers; outlier detection", } @Article{Chen:2009:BAS, author = "Bee-Chung Chen and Raghu Ramakrishnan and Jude W. Shavlik and Pradeep Tamma", title = "Bellwether analysis: {Searching} for cost-effective query-defined predictors in large databases", journal = j-TKDD, volume = "3", number = "1", pages = "5:1--5:??", month = mar, year = "2009", CODEN = "????", DOI = "http://doi.acm.org/10.1145/1497577.1497582", ISSN = "1556-4681 (print), 1556-472X (electronic)", bibdate = "Fri Apr 24 18:00:01 MDT 2009", bibsource = "http://www.acm.org/pubs/contents/journals/tkdd/; http://www.math.utah.edu/pub/tex/bib/tkdd.bib", abstract = "How to mine massive datasets is a challenging problem with great potential value. Motivated by this challenge, much effort has concentrated on developing scalable versions of machine learning algorithms. 
However, the cost of mining large datasets is not just computational; preparing the datasets into the ``right form'' so that learning algorithms can be applied is usually costly, due to the human labor that is typically required and a large number of choices in data preparation, which include selecting different subsets of data and aggregating data at different granularities. We make the key observation that, for a number of practically motivated problems, these choices can be defined using database queries and analyzed in an automatic and systematic manner. Specifically, we propose a new class of data-mining problem, called {\em bellwether analysis}, in which the goal is to find a few query-defined predictors (e.g., first week sales of Peoria, IL of an item) that can be used to accurately predict the result of a target query (e.g., first year worldwide sales of the item) from a large number of queries that define candidate predictors. To make a prediction for a new item, the data needed to generate such predictors has to be collected (e.g., selling the new item in Peoria, IL for a week and collecting the sales data). A useful predictor is one that has high prediction accuracy and a low data-collection cost. We call such a cost-effective predictor a {\em bellwether}.\par This article introduces bellwether analysis, which integrates database query processing and predictive modeling into a single framework, and provides scalable algorithms for large datasets that cannot fit in main memory. 
Through a series of extensive experiments, we show that bellwethers do exist in real-world databases, and that our computation techniques achieve good efficiency on large datasets.", acknowledgement = ack-nhfb, articleno = "5", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J1054", keywords = "bellwether; Cost-effective prediction; data cube; OLAP queries; predictive models; scalable algorithms", } @Article{Liu:2009:ISI, author = "Huan Liu and John Salerno and Michael Young and Rakesh Agrawal and Philip S. Yu", title = "Introduction to special issue on social computing, behavioral modeling, and prediction", journal = j-TKDD, volume = "3", number = "2", pages = "6:1--6:??", month = apr, year = "2009", CODEN = "????", DOI = "http://doi.acm.org/10.1145/1514888.1514889", ISSN = "1556-4681 (print), 1556-472X (electronic)", bibdate = "Fri Apr 24 18:00:12 MDT 2009", bibsource = "http://www.acm.org/pubs/contents/journals/tkdd/; http://www.math.utah.edu/pub/tex/bib/tkdd.bib", acknowledgement = ack-nhfb, articleno = "6", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J1054", } @Article{Mehler:2009:ENC, author = "Andrew Mehler and Steven Skiena", title = "Expanding network communities from representative examples", journal = j-TKDD, volume = "3", number = "2", pages = "7:1--7:??", month = apr, year = "2009", CODEN = "????", DOI = "http://doi.acm.org/10.1145/1514888.1514890", ISSN = "1556-4681 (print), 1556-472X (electronic)", bibdate = "Fri Apr 24 18:00:12 MDT 2009", bibsource = "http://www.acm.org/pubs/contents/journals/tkdd/; http://www.math.utah.edu/pub/tex/bib/tkdd.bib", abstract = "We present an approach to leverage a small subset of a coherent community within a social network into a much larger, more representative sample. Our problem becomes identifying a small conductance subgraph containing many (but not necessarily all) members of the given seed set. 
Starting with an initial seed set representing a sample of a community, we seek to discover as much of the full community as possible.\par We present a general method for network community expansion, demonstrating that our methods work well in expanding communities in real world networks starting from small given seed groups (20 to 400 members). Our approach is marked by incremental expansion from the seeds with retrospective analysis to determine the ultimate boundaries of our community. We demonstrate how to increase the robustness of the general approach through bootstrapping multiple random partitions of the input set into seed and evaluation groups.\par We go beyond statistical comparisons against gold standards to careful subjective evaluations of our expanded communities. This process explains the causes of most disagreement between our expanded communities and our gold-standards --- arguing that our expansion methods provide more reliable communities than can be extracted from reference sources/gazetteers such as Wikipedia.", acknowledgement = ack-nhfb, articleno = "7", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J1054", keywords = "artificial intelligence; community discovery; Discrete mathematics; graph theory; news analysis; social networks", } @Article{Lin:2009:ACT, author = "Yu-Ru Lin and Yun Chi and Shenghuo Zhu and Hari Sundaram and Belle L. Tseng", title = "Analyzing communities and their evolutions in dynamic social networks", journal = j-TKDD, volume = "3", number = "2", pages = "8:1--8:??", month = apr, year = "2009", CODEN = "????", DOI = "http://doi.acm.org/10.1145/1514888.1514891", ISSN = "1556-4681 (print), 1556-472X (electronic)", bibdate = "Fri Apr 24 18:00:12 MDT 2009", bibsource = "http://www.acm.org/pubs/contents/journals/tkdd/; http://www.math.utah.edu/pub/tex/bib/tkdd.bib", abstract = "We discover communities from social network data and analyze the community evolution. 
These communities are inherent characteristics of human interaction in online social networks, as well as paper citation networks. Also, communities may evolve over time, due to changes to individuals' roles and social status in the network as well as changes to individuals' research interests. We present an innovative algorithm that deviates from the traditional two-step approach to analyze community evolutions. In the traditional approach, communities are first detected for each time slice, and then compared to determine correspondences. We argue that this approach is inappropriate in applications with noisy data. In this paper, we propose {\em FacetNet\/} for analyzing communities and their evolutions through a robust {\em unified\/} process. This novel framework will discover communities and capture their evolution with temporal smoothness given by historic community structures. Our approach relies on formulating the problem in terms of maximum a posteriori (MAP) estimation, where the community structure is estimated both by the observed networked data and by the prior distribution given by historic community structures. Then we develop an iterative algorithm, with proven low time complexity, which is guaranteed to converge to an optimal solution. 
We perform extensive experimental studies, on both synthetic datasets and real datasets, to demonstrate that our method discovers meaningful communities and provides additional insights not directly obtainable from traditional methods.", acknowledgement = ack-nhfb, articleno = "8", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J1054", keywords = "Community; community net; evolution; evolution net; nonnegative matrix factorization; soft membership", } @Article{Kimura:2009:BLM, author = "Masahiro Kimura and Kazumi Saito and Hiroshi Motoda", title = "Blocking links to minimize contamination spread in a social network", journal = j-TKDD, volume = "3", number = "2", pages = "9:1--9:??", month = apr, year = "2009", CODEN = "????", DOI = "http://doi.acm.org/10.1145/1514888.1514892", ISSN = "1556-4681 (print), 1556-472X (electronic)", bibdate = "Fri Apr 24 18:00:12 MDT 2009", bibsource = "http://www.acm.org/pubs/contents/journals/tkdd/; http://www.math.utah.edu/pub/tex/bib/tkdd.bib", abstract = "We address the problem of minimizing the propagation of undesirable things, such as computer viruses or malicious rumors, by blocking a limited number of links in a network, which is converse to the influence maximization problem in which the most influential nodes for information diffusion is searched in a social network. This minimization problem is more fundamental than the problem of preventing the spread of contamination by removing nodes in a network. We introduce two definitions for the contamination degree of a network, accordingly define two contamination minimization problems, and propose methods for efficiently finding good approximate solutions to these problems on the basis of a naturally greedy strategy. Using large social networks, we experimentally demonstrate that the proposed methods outperform conventional link-removal methods. 
We also show that unlike the case of blocking a limited number of nodes, the strategy of removing nodes with high out-degrees is not necessarily effective for these problems.", acknowledgement = ack-nhfb, articleno = "9", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J1054", keywords = "Contamination diffusion; link analysis; social networks", } @Article{Agichtein:2009:MIS, author = "Eugene Agichtein and Yandong Liu and Jiang Bian", title = "Modeling information-seeker satisfaction in community question answering", journal = j-TKDD, volume = "3", number = "2", pages = "10:1--10:??", month = apr, year = "2009", CODEN = "????", DOI = "http://doi.acm.org/10.1145/1514888.1514893", ISSN = "1556-4681 (print), 1556-472X (electronic)", bibdate = "Fri Apr 24 18:00:12 MDT 2009", bibsource = "http://www.acm.org/pubs/contents/journals/tkdd/; http://www.math.utah.edu/pub/tex/bib/tkdd.bib", abstract = "Question Answering Communities such as Naver, Baidu Knows, and Yahoo! Answers have emerged as popular, and often effective, means of information seeking on the web. By posting questions for other participants to answer, information seekers can obtain specific answers to their questions. Users of CQA portals have already contributed millions of questions, and received hundreds of millions of answers from other participants. However, CQA is not always effective: in some cases, a user may obtain a perfect answer within minutes, and in others it may require hours --- and sometimes days --- until a satisfactory answer is contributed. We investigate the problem of predicting information seeker satisfaction in collaborative question answering communities, where we attempt to predict whether a question author will be satisfied with the answers submitted by the community participants. We present a general prediction model, and develop a variety of content, structure, and community-focused features for this task. 
Our experimental results, obtained from a large-scale evaluation over thousands of real questions and user ratings, demonstrate the feasibility of modeling and predicting asker satisfaction. We complement our results with a thorough investigation of the interactions and information seeking patterns in question answering communities that correlate with information seeker satisfaction. We also explore {\em personalized\/} models of asker satisfaction, and show that when sufficient interaction history exists, personalization can significantly improve prediction accuracy over a ``one-size-fits-all'' model. Our models and predictions could be useful for a variety of applications, such as user intent inference, answer ranking, interface design, and query suggestion and routing.", acknowledgement = ack-nhfb, articleno = "10", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J1054", keywords = "Community question answering; information seeker satisfaction", } @Article{Torvik:2009:AND, author = "Vetle I. Torvik and Neil R. Smalheiser", title = "Author name disambiguation in {MEDLINE}", journal = j-TKDD, volume = "3", number = "3", pages = "11:1--11:??", month = jul, year = "2009", CODEN = "????", DOI = "http://doi.acm.org/10.1145/1552303.1552304", ISSN = "1556-4681 (print), 1556-472X (electronic)", bibdate = "Tue Mar 16 18:36:58 MDT 2010", bibsource = "http://www.acm.org/pubs/contents/journals/tkdd/; http://www.math.utah.edu/pub/tex/bib/tkdd.bib", abstract = "{\em Background\/}: We recently described ``Author-ity,'' a model for estimating the probability that two articles in MEDLINE, sharing the same author name, were written by the same individual. Features include shared title words, journal name, coauthors, medical subject headings, language, affiliations, and author name features (middle initial, suffix, and prevalence in MEDLINE). 
Here we test the hypothesis that the Author-ity model will suffice to disambiguate author names for the vast majority of articles in MEDLINE. {\em Methods\/}: Enhancements include: (a) incorporating first names and their variants, email addresses, and correlations between specific last names and affiliation words; (b) new methods of generating large unbiased training sets; (c) new methods for estimating the prior probability; (d) a weighted least squares algorithm for correcting transitivity violations; and (e) a maximum likelihood based agglomerative algorithm for computing clusters of articles that represent inferred author-individuals. {\em Results\/}: Pairwise comparisons were computed for all author names on all 15.3 million articles in MEDLINE (2006 baseline), that share last name and first initial, to create Author-ity 2006, a database that has each name on each article assigned to one of 6.7 million inferred author-individual clusters. Recall is estimated at $\approx 98.8\%$. Lumping (putting two different individuals into the same cluster) affects $\approx 0.5\%$ of clusters, whereas splitting (assigning articles written by the same individual to $> 1$ cluster) affects $\approx 2\%$ of articles. {\em Impact\/}: The Author-ity model can be applied generally to other bibliographic databases. Author name disambiguation allows information retrieval and data integration to become {\em person-centered}, not just {\em document-centered}, setting the stage for new data mining and social network tools that will facilitate the analysis of scholarly publishing and collaboration behavior. 
{\em Availability\/}: The Author-ity 2006 database is available for nonprofit academic research, and can be freely queried via http://arrowsmith.psych.uic.edu.", acknowledgement = ack-nhfb, articleno = "11", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J1054", keywords = "bibliographic databases; Name disambiguation", } @Article{Tu:2009:SDC, author = "Li Tu and Yixin Chen", title = "Stream data clustering based on grid density and attraction", journal = j-TKDD, volume = "3", number = "3", pages = "12:1--12:??", month = jul, year = "2009", CODEN = "????", DOI = "http://doi.acm.org/10.1145/1552303.1552305", ISSN = "1556-4681 (print), 1556-472X (electronic)", bibdate = "Tue Mar 16 18:36:58 MDT 2010", bibsource = "http://www.acm.org/pubs/contents/journals/tkdd/; http://www.math.utah.edu/pub/tex/bib/tkdd.bib", abstract = "Clustering real-time stream data is an important and challenging problem. Existing algorithms such as CluStream are based on the {\em k\/}-means algorithm. These clustering algorithms have difficulties finding clusters of arbitrary shapes and handling outliers. Further, they require the knowledge of {\em k\/} and user-specified time window. To address these issues, this article proposes {\em D-Stream}, a framework for clustering stream data using a density-based approach.\par Our algorithm uses an online component that maps each input data record into a grid and an offline component that computes the grid density and clusters the grids based on the density. The algorithm adopts a density decaying technique to capture the dynamic changes of a data stream and an attraction-based mechanism to accurately generate cluster boundaries.\par Exploiting the intricate relationships among the decay factor, attraction, data density, and cluster structure, our algorithm can efficiently and effectively generate and adjust the clusters in real time. 
Further, a theoretically sound technique is developed to detect and remove sporadic grids mapped by outliers in order to dramatically improve the space and time efficiency of the system. The technique makes high-speed data stream clustering feasible without degrading the clustering quality. The experimental results show that our algorithm has superior quality and efficiency, can find clusters of arbitrary shapes, and can accurately recognize the evolving behaviors of real-time data streams.", acknowledgement = ack-nhfb, articleno = "12", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J1054", keywords = "clustering; data mining; density-based algorithms; Stream data", } @Article{Zhou:2009:LST, author = "Bin Zhou and Jian Pei", title = "Link spam target detection using page farms", journal = j-TKDD, volume = "3", number = "3", pages = "13:1--13:??", month = jul, year = "2009", CODEN = "????", DOI = "http://doi.acm.org/10.1145/1552303.1552306", ISSN = "1556-4681 (print), 1556-472X (electronic)", bibdate = "Tue Mar 16 18:36:58 MDT 2010", bibsource = "http://www.acm.org/pubs/contents/journals/tkdd/; http://www.math.utah.edu/pub/tex/bib/tkdd.bib", abstract = "Currently, most popular Web search engines adopt some link-based ranking methods such as PageRank. Driven by the huge potential benefit of improving rankings of Web pages, many tricks have been attempted to boost page rankings. The most common way, which is known as link spam, is to make up some artificially designed link structures. Detecting link spam effectively is a big challenge. In this article, we develop novel and effective detection methods for link spam target pages using page farms. The essential idea is intuitive: whether a page is the beneficiary of link spam is reflected by how it collects its PageRank score. Technically, how a target page collects its PageRank score is modeled by a page farm, which consists of pages contributing a major portion of the PageRank score of the target page. 
We propose two spamicity measures based on page farms. They can be used as an effective measure to check whether the pages are link spam target pages. An empirical study using a newly available real dataset strongly suggests that our method is effective. It outperforms the state-of-the-art methods like SpamRank and SpamMass in both precision and recall.", acknowledgement = ack-nhfb, articleno = "13", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J1054", keywords = "Link Spam; Page Farm; PageRank", } @Article{Wan:2009:DBC, author = "Li Wan and Wee Keong Ng and Xuan Hong Dang and Philip S. Yu and Kuan Zhang", title = "Density-based clustering of data streams at multiple resolutions", journal = j-TKDD, volume = "3", number = "3", pages = "14:1--14:??", month = jul, year = "2009", CODEN = "????", DOI = "http://doi.acm.org/10.1145/1552303.1552307", ISSN = "1556-4681 (print), 1556-472X (electronic)", bibdate = "Tue Mar 16 18:36:58 MDT 2010", bibsource = "http://www.acm.org/pubs/contents/journals/tkdd/; http://www.math.utah.edu/pub/tex/bib/tkdd.bib", abstract = "In data stream clustering, it is desirable to have algorithms that are able to detect clusters of arbitrary shape, clusters that evolve over time, and clusters with noise. Existing stream data clustering algorithms are generally based on an online-offline approach: The online component captures synopsis information from the data stream (thus, overcoming real-time and memory constraints) and the offline component generates clusters using the stored synopsis. The online-offline approach affects the overall performance of stream data clustering in various ways: the ease of deriving synopsis from streaming data; the complexity of data structure for storing and managing synopsis; and the frequency at which the offline component is used to generate clusters. 
In this article, we propose an algorithm that (1) computes and updates synopsis information in constant time; (2) allows users to discover clusters at multiple resolutions; (3) determines the right time for users to generate clusters from the synopsis information; (4) generates clusters of higher purity than existing algorithms; and (5) determines the right threshold function for density-based clustering based on the fading model of stream data. To the best of our knowledge, no existing data stream algorithm has all of these features. Experimental results show that our algorithm is able to detect arbitrarily shaped, evolving clusters with high quality.", acknowledgement = ack-nhfb, articleno = "14", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J1054", keywords = "Data mining algorithms; density based clustering; evolving data streams", } @Article{Mannila:2009:ATS, author = "Heikki Mannila and Dimitrios Gunopulos", title = "{ACM TKDD} special issue {ACM SIGKDD 2007} and {ACM SIGKDD 2008}", journal = j-TKDD, volume = "3", number = "4", pages = "15:1--15:??", month = nov, year = "2009", CODEN = "????", DOI = "http://doi.acm.org/10.1145/1631162.1631163", ISSN = "1556-4681 (print), 1556-472X (electronic)", bibdate = "Tue Mar 16 18:37:13 MDT 2010", bibsource = "http://www.acm.org/pubs/contents/journals/tkdd/; http://www.math.utah.edu/pub/tex/bib/tkdd.bib", acknowledgement = ack-nhfb, articleno = "15", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J1054", } @Article{Asur:2009:EBF, author = "Sitaram Asur and Srinivasan Parthasarathy and Duygu Ucar", title = "An event-based framework for characterizing the evolutionary behavior of interaction graphs", journal = j-TKDD, volume = "3", number = "4", pages = "16:1--16:??", month = nov, year = "2009", CODEN = "????", DOI = "http://doi.acm.org/10.1145/1631162.1631164", ISSN = "1556-4681 (print), 1556-472X (electronic)", bibdate = "Tue Mar 16 18:37:13 MDT 2010", bibsource = 
"http://www.acm.org/pubs/contents/journals/tkdd/; http://www.math.utah.edu/pub/tex/bib/tkdd.bib", abstract = "Interaction graphs are ubiquitous in many fields such as bioinformatics, sociology and physical sciences. There have been many studies in the literature targeted at studying and mining these graphs. However, almost all of them have studied these graphs from a static point of view. The study of the evolution of these graphs over time can provide tremendous insight on the behavior of entities, communities and the flow of information among them. In this work, we present an event-based characterization of critical behavioral patterns for temporally varying interaction graphs. We use nonoverlapping snapshots of interaction graphs and develop a framework for capturing and identifying interesting events from them. We use these events to characterize complex behavioral patterns of individuals and communities over time. We show how semantic information can be incorporated to reason about community-behavior events. We also demonstrate the application of behavioral patterns for the purposes of modeling evolution, link prediction and influence maximization. Finally, we present a diffusion model for evolving networks, based on our framework.", acknowledgement = ack-nhfb, articleno = "16", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J1054", keywords = "diffusion of innovations; Dynamic interaction networks; evolutionary analysis", } @Article{Chi:2009:ESC, author = "Yun Chi and Xiaodan Song and Dengyong Zhou and Koji Hino and Belle L. 
Tseng", title = "On evolutionary spectral clustering", journal = j-TKDD, volume = "3", number = "4", pages = "17:1--17:??", month = nov, year = "2009", CODEN = "????", DOI = "http://doi.acm.org/10.1145/1631162.1631165", ISSN = "1556-4681 (print), 1556-472X (electronic)", bibdate = "Tue Mar 16 18:37:13 MDT 2010", bibsource = "http://www.acm.org/pubs/contents/journals/tkdd/; http://www.math.utah.edu/pub/tex/bib/tkdd.bib", abstract = "Evolutionary clustering is an emerging research area essential to important applications such as clustering dynamic Web and blog contents and clustering data streams. In evolutionary clustering, a good clustering result should fit the current data well, while simultaneously not deviate too dramatically from the recent history. To fulfill this dual purpose, a measure of {\em temporal smoothness\/} is integrated in the overall measure of clustering quality. In this article, we propose two frameworks that incorporate temporal smoothness in evolutionary spectral clustering. For both frameworks, we start with intuitions gained from the well-known {\em k\/}-means clustering problem, and then propose and solve corresponding cost functions for the evolutionary spectral clustering problems. Our solutions to the evolutionary spectral clustering problems provide more stable and consistent clustering results that are less sensitive to short-term noises while at the same time are adaptive to long-term cluster drifts. Furthermore, we demonstrate that our methods provide the optimal solutions to the relaxed versions of the corresponding evolutionary {\em k\/}-means clustering problems. 
Performance experiments over a number of real and synthetic data sets illustrate our evolutionary spectral clustering methods provide more robust clustering results that are not sensitive to noise and can adapt to data drifts.", acknowledgement = ack-nhfb, articleno = "17", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J1054", keywords = "Evolutionary spectral clustering; preserving cluster membership; preserving cluster quality; temporal smoothness", } @Article{Fujiwara:2009:FLS, author = "Yasuhiro Fujiwara and Yasushi Sakurai and Masaru Kitsuregawa", title = "Fast likelihood search for hidden {Markov} models", journal = j-TKDD, volume = "3", number = "4", pages = "18:1--18:??", month = nov, year = "2009", CODEN = "????", DOI = "http://doi.acm.org/10.1145/1631162.1631166", ISSN = "1556-4681 (print), 1556-472X (electronic)", bibdate = "Tue Mar 16 18:37:13 MDT 2010", bibsource = "http://www.acm.org/pubs/contents/journals/tkdd/; http://www.math.utah.edu/pub/tex/bib/tkdd.bib", abstract = "Hidden Markov models (HMMs) are receiving considerable attention in various communities and many applications that use HMMs have emerged such as mental task classification, biological analysis, traffic monitoring, and anomaly detection. This article has two goals: the first goal is exact and efficient identification of the model whose state sequence has the highest likelihood for the given query sequence (more precisely, no HMM that actually has a high-probability path for the given sequence is missed by the algorithm), and the second goal is exact and efficient monitoring of streaming data sequences to find the best model. We propose SPIRAL, a fast search method for HMM datasets. 
SPIRAL is based on three ideas: (1) it clusters states of models to compute approximate likelihood, (2) it uses several granularities and approximates likelihood values in search processing, and (3) it focuses on just the promising likelihood computations by pruning out low-likelihood state sequences. Experiments verify the effectiveness of SPIRAL and show that it is more than 490 times faster than the naive method.", acknowledgement = ack-nhfb, articleno = "18", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J1054", keywords = "Hidden Markov model; likelihood; upper bound", } @Article{Zhang:2009:EAG, author = "Xiang Zhang and Fei Zou and Wei Wang", title = "Efficient algorithms for genome-wide association study", journal = j-TKDD, volume = "3", number = "4", pages = "19:1--19:??", month = nov, year = "2009", CODEN = "????", DOI = "http://doi.acm.org/10.1145/1631162.1631167", ISSN = "1556-4681 (print), 1556-472X (electronic)", bibdate = "Tue Mar 16 18:37:13 MDT 2010", bibsource = "http://www.acm.org/pubs/contents/journals/tkdd/; http://www.math.utah.edu/pub/tex/bib/tkdd.bib", abstract = "Studying the association between quantitative phenotype (such as height or weight) and single nucleotide polymorphisms (SNPs) is an important problem in biology. To understand underlying mechanisms of complex phenotypes, it is often necessary to consider joint genetic effects across multiple SNPs. ANOVA (analysis of variance) test is routinely used in association study. Important findings from studying gene-gene (SNP-pair) interactions are appearing in the literature. However, the number of SNPs can be up to millions. Evaluating joint effects of SNPs is a challenging task even for SNP-pairs. 
Moreover, with large number of SNPs correlated, permutation procedure is preferred over simple Bonferroni correction for properly controlling family-wise error rate and retaining mapping power, which dramatically increases the computational cost of association study.\par In this article, we study the problem of finding SNP-pairs that have significant associations with a given quantitative phenotype. We propose an efficient algorithm, FastANOVA, for performing ANOVA tests on SNP-pairs in a batch mode, which also supports large permutation test. We derive an upper bound of SNP-pair ANOVA test, which can be expressed as the sum of two terms. The first term is based on single-SNP ANOVA test. The second term is based on the SNPs and independent of any phenotype permutation. Furthermore, SNP-pairs can be organized into groups, each of which shares a common upper bound. This allows for maximum reuse of intermediate computation, efficient upper bound estimation, and effective SNP-pair pruning. Consequently, FastANOVA only needs to perform the ANOVA test on a small number of candidate SNP-pairs without the risk of missing any significant ones. Extensive experiments demonstrate that FastANOVA is orders of magnitude faster than the brute-force implementation of ANOVA tests on all SNP pairs. 
The principles used in FastANOVA can be applied to categorical phenotypes and other statistics such as Chi-square test.", acknowledgement = ack-nhfb, articleno = "19", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J1054", keywords = "ANOVA test; Association study; permutation test", } @Article{Bilgic:2009:RCM, author = "Mustafa Bilgic and Lise Getoor", title = "Reflect and correct: {A} misclassification prediction approach to active inference", journal = j-TKDD, volume = "3", number = "4", pages = "20:1--20:??", month = nov, year = "2009", CODEN = "????", DOI = "http://doi.acm.org/10.1145/1631162.1631168", ISSN = "1556-4681 (print), 1556-472X (electronic)", bibdate = "Tue Mar 16 18:37:13 MDT 2010", bibsource = "http://www.acm.org/pubs/contents/journals/tkdd/; http://www.math.utah.edu/pub/tex/bib/tkdd.bib", abstract = "Information diffusion, viral marketing, graph-based semi-supervised learning, and collective classification all attempt to model and exploit the relationships among nodes in a network to improve the performance of node labeling algorithms. However, sometimes the advantage of exploiting the relationships can become a disadvantage. Simple models like label propagation and iterative classification can aggravate a misclassification by propagating mistakes in the network, while more complex models that define and optimize a global objective function, such as Markov random fields and graph mincuts, can misclassify a set of nodes jointly. This problem can be mitigated if the classification system is allowed to ask for the correct labels for a few of the nodes during inference. However, determining the optimal set of labels to acquire is intractable under relatively general assumptions, which forces us to resort to approximate and heuristic techniques. We describe three such techniques in this article. 
The first one is based on directly approximating the value of the objective function of label acquisition and greedily acquiring the label that provides the most improvement. The second technique is a simple technique based on the analogy we draw between viral marketing and label acquisition. Finally, we propose a method, which we refer to as {\em reflect and correct}, that can learn and predict when the classification system is likely to make mistakes and suggests acquisitions to correct those mistakes. We empirically show on a variety of synthetic and real-world datasets that the reflect and correct method significantly outperforms the other two techniques, as well as other approaches based on network structural measures such as node degree and network clustering.", acknowledgement = ack-nhfb, articleno = "20", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J1054", keywords = "Active inference; collective classification; information diffusion; label acquisition; viral marketing", } @Article{Kiernan:2009:CCS, author = "Jerry Kiernan and Evimaria Terzi", title = "Constructing comprehensive summaries of large event sequences", journal = j-TKDD, volume = "3", number = "4", pages = "21:1--21:??", month = nov, year = "2009", CODEN = "????", DOI = "http://doi.acm.org/10.1145/1631162.1631169", ISSN = "1556-4681 (print), 1556-472X (electronic)", bibdate = "Tue Mar 16 18:37:13 MDT 2010", bibsource = "http://www.acm.org/pubs/contents/journals/tkdd/; http://www.math.utah.edu/pub/tex/bib/tkdd.bib", abstract = "Event sequences capture system and user activity over time. Prior research on sequence mining has mostly focused on discovering local patterns appearing in a sequence. While interesting, these patterns do not give a comprehensive summary of the entire event sequence. Moreover, the number of patterns discovered can be large. 
In this article, we take an alternative approach and build {\em short\/} summaries that describe an entire sequence, and discover local dependencies between event types.\par We formally define the summarization problem as an optimization problem that balances shortness of the summary with accuracy of the data description. We show that this problem can be solved optimally in polynomial time by using a combination of two dynamic-programming algorithms. We also explore more efficient greedy alternatives and demonstrate that they work well on large datasets. Experiments on both synthetic and real datasets illustrate that our algorithms are efficient and produce high-quality results, and reveal interesting local structures in the data.", acknowledgement = ack-nhfb, articleno = "21", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J1054", keywords = "Event sequences; log mining; summarization", } @Article{Koren:2010:FNS, author = "Yehuda Koren", title = "Factor in the neighbors: {Scalable} and accurate collaborative filtering", journal = j-TKDD, volume = "4", number = "1", pages = "1:1--1:??", month = jan, year = "2010", CODEN = "????", DOI = "http://doi.acm.org/10.1145/1644873.1644874", ISSN = "1556-4681 (print), 1556-472X (electronic)", bibdate = "Tue Mar 16 18:37:37 MDT 2010", bibsource = "http://www.acm.org/pubs/contents/journals/tkdd/; http://www.math.utah.edu/pub/tex/bib/tkdd.bib", abstract = "Recommender systems provide users with personalized suggestions for products or services. These systems often rely on collaborative filtering (CF), where past transactions are analyzed in order to establish connections between users and products. The most common approach to CF is based on neighborhood models, which originate from similarities between products or users. In this work we introduce a new neighborhood model with an improved prediction accuracy. 
Unlike previous approaches that are based on heuristic similarities, we model neighborhood relations by minimizing a global cost function. Further accuracy improvements are achieved by extending the model to exploit both explicit and implicit feedback by the users. Past models were limited by the need to compute all pairwise similarities between items or users, which grow quadratically with input size. In particular, this limitation vastly complicates adopting user similarity models, due to the typical large number of users. Our new model solves these limitations by factoring the neighborhood model, thus making both item-item and user-user implementations scale linearly with the size of the data. The methods are tested on the Netflix data, with encouraging results.", acknowledgement = ack-nhfb, articleno = "1", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J1054", keywords = "collaborative filtering; Netflix Prize; Recommender systems", } @Article{Syed:2010:MDP, author = "Zeeshan Syed and Collin Stultz and Manolis Kellis and Piotr Indyk and John Guttag", title = "Motif discovery in physiological datasets: {A} methodology for inferring predictive elements", journal = j-TKDD, volume = "4", number = "1", pages = "2:1--2:??", month = jan, year = "2010", CODEN = "????", DOI = "http://doi.acm.org/10.1145/1644873.1644875", ISSN = "1556-4681 (print), 1556-472X (electronic)", bibdate = "Tue Mar 16 18:37:37 MDT 2010", bibsource = "http://www.acm.org/pubs/contents/journals/tkdd/; http://www.math.utah.edu/pub/tex/bib/tkdd.bib", abstract = "In this article, we propose a methodology for identifying predictive physiological patterns in the absence of prior knowledge. We use the principle of conservation to identify activity that consistently precedes an outcome in patients, and describe a two-stage process that allows us to efficiently search for such patterns in large datasets. 
This involves first transforming continuous physiological signals from patients into symbolic sequences, and then searching for patterns in these reduced representations that are strongly associated with an outcome.\par Our strategy of identifying conserved activity that is unlikely to have occurred purely by chance in symbolic data is analogous to the discovery of regulatory motifs in genomic datasets. We build upon existing work in this area, generalizing the notion of a regulatory motif and enhancing current techniques to operate robustly on non-genomic data. We also address two significant considerations associated with motif discovery in general: computational efficiency and robustness in the presence of degeneracy and noise. To deal with these issues, we introduce the concept of active regions and new subset-based techniques such as a two-layer Gibbs sampling algorithm. These extensions allow for a framework for information inference, where precursors are identified as approximately conserved activity of arbitrary complexity preceding multiple occurrences of an event.\par We evaluated our solution on a population of patients who experienced sudden cardiac death and attempted to discover electrocardiographic activity that may be associated with the endpoint of death. To assess the predictive patterns discovered, we compared likelihood scores for motifs in the sudden death population against control populations of normal individuals and those with non-fatal supraventricular arrhythmias. Our results suggest that predictive motif discovery may be able to identify clinically relevant information even in the absence of significant prior knowledge.", acknowledgement = ack-nhfb, articleno = "2", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J1054", keywords = "data mining; Gibbs sampling; inference; knowledge discovery; motifs; physiological signals", } @Article{Webb:2010:SSI, author = "Geoffrey I. 
Webb", title = "Self-sufficient itemsets: {An} approach to screening potentially interesting associations between items", journal = j-TKDD, volume = "4", number = "1", pages = "3:1--3:??", month = jan, year = "2010", CODEN = "????", DOI = "http://doi.acm.org/10.1145/1644873.1644876", ISSN = "1556-4681 (print), 1556-472X (electronic)", bibdate = "Tue Mar 16 18:37:37 MDT 2010", bibsource = "http://www.acm.org/pubs/contents/journals/tkdd/; http://www.math.utah.edu/pub/tex/bib/tkdd.bib", abstract = "Self-sufficient itemsets are those whose frequency cannot be explained solely by the frequency of either their subsets or of their supersets. We argue that itemsets that are not self-sufficient will often be of little interest to the data analyst, as their frequency should be expected once that of the itemsets on which their frequency depends is known. We present tests for statistically sound discovery of self-sufficient itemsets, and computational techniques that allow those tests to be applied as a post-processing step for any itemset discovery algorithm. 
We also present a measure for assessing the degree of potential interest in an itemset that complements these statistical measures.", acknowledgement = ack-nhfb, articleno = "3", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J1054", keywords = "Association discovery; association rules; itemset discovery; itemset screening; statistical evaluation", } @Article{Plantevit:2010:MMM, author = "Marc Plantevit and Anne Laurent and Dominique Laurent and Maguelonne Teisseire and Yeow Wei Choong", title = "Mining multidimensional and multilevel sequential patterns", journal = j-TKDD, volume = "4", number = "1", pages = "4:1--4:??", month = jan, year = "2010", CODEN = "????", DOI = "http://doi.acm.org/10.1145/1644873.1644877", ISSN = "1556-4681 (print), 1556-472X (electronic)", bibdate = "Tue Mar 16 18:37:37 MDT 2010", bibsource = "http://www.acm.org/pubs/contents/journals/tkdd/; http://www.math.utah.edu/pub/tex/bib/tkdd.bib", abstract = "Multidimensional databases have been designed to provide decision makers with the necessary tools to help them understand their data. This framework is different from transactional data as the datasets contain huge volumes of historicized and aggregated data defined over a set of dimensions that can be arranged through multiple levels of granularities. Many tools have been proposed to query the data and navigate through the levels of granularity. However, automatic tools are still missing to mine this type of data in order to discover regular specific patterns. In this article, we present a method for mining sequential patterns from multidimensional databases, at the same time taking advantage of the different dimensions and levels of granularity, which is original compared to existing work. The necessary definitions and algorithms are extended from regular sequential patterns to this particular case. 
Experiments are reported, showing the significance of this approach.", acknowledgement = ack-nhfb, articleno = "4", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J1054", keywords = "frequent patterns; hierarchy; multidimensional databases; multilevel patterns; Sequential patterns", } @Article{Zaki:2010:VVO, author = "Mohammed J. Zaki and Christopher D. Carothers and Boleslaw K. Szymanski", title = "{VOGUE}: {A} variable order hidden {Markov} model with duration based on frequent sequence mining", journal = j-TKDD, volume = "4", number = "1", pages = "5:1--5:??", month = jan, year = "2010", CODEN = "????", DOI = "http://doi.acm.org/10.1145/1644873.1644878", ISSN = "1556-4681 (print), 1556-472X (electronic)", bibdate = "Tue Mar 16 18:37:37 MDT 2010", bibsource = "http://www.acm.org/pubs/contents/journals/tkdd/; http://www.math.utah.edu/pub/tex/bib/tkdd.bib", abstract = "We present VOGUE, a novel, variable order hidden Markov model with state durations, that combines two separate techniques for modeling complex patterns in sequential data: pattern mining and data modeling. VOGUE relies on a variable gap sequence mining method to extract frequent patterns with different lengths and gaps between elements. It then uses these mined sequences to build a variable order hidden Markov model (HMM), that explicitly models the gaps. The gaps implicitly model the order of the HMM, and they explicitly model the duration of each state. We apply VOGUE to a variety of real sequence data taken from domains such as protein sequence classification, Web usage logs, intrusion detection, and spelling correction. We show that VOGUE has superior classification accuracy compared to regular HMMs, higher-order HMMs, and even special purpose HMMs like HMMER, which is a state-of-the-art method for protein classification. 
The VOGUE implementation and the datasets used in this article are available as open-source.", acknowledgement = ack-nhfb, articleno = "5", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J1054", keywords = "Hidden Markov models; higher-order HMM; HMM with duration; sequence mining and modeling; variable-order HMM", } @Article{Vadera:2010:CCS, author = "Sunil Vadera", title = "{CSNL}: {A} cost-sensitive non-linear decision tree algorithm", journal = j-TKDD, volume = "4", number = "2", pages = "6:1--6:??", month = may, year = "2010", CODEN = "????", DOI = "http://doi.acm.org/10.1145/1754428.1754429", ISSN = "1556-4681 (print), 1556-472X (electronic)", bibdate = "Sat Aug 14 17:12:30 MDT 2010", bibsource = "http://www.acm.org/pubs/contents/journals/tkdd/; http://www.math.utah.edu/pub/tex/bib/tkdd.bib", abstract = "This article presents a new decision tree learning algorithm called CSNL that induces Cost-Sensitive Non-Linear decision trees. The algorithm is based on the hypothesis that nonlinear decision nodes provide a better basis than axis-parallel decision nodes and utilizes discriminant analysis to construct nonlinear decision trees that take account of costs of misclassification.\par The performance of the algorithm is evaluated by applying it to seventeen datasets and the results are compared with those obtained by two well known cost-sensitive algorithms, ICET and MetaCost, which generate multiple trees to obtain some of the best results to date. The results show that CSNL performs at least as well, if not better than these algorithms, in more than twelve of the datasets and is considerably faster. 
The use of bagging with CSNL further enhances its performance showing the significant benefits of using nonlinear decision nodes.", acknowledgement = ack-nhfb, articleno = "6", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J1054", keywords = "cost-sensitive learning; Decision tree learning", } @Article{Kandylas:2010:AKC, author = "Vasileios Kandylas and S. Phineas Upham and Lyle H. Ungar", title = "Analyzing knowledge communities using foreground and background clusters", journal = j-TKDD, volume = "4", number = "2", pages = "7:1--7:??", month = may, year = "2010", CODEN = "????", DOI = "http://doi.acm.org/10.1145/1754428.1754430", ISSN = "1556-4681 (print), 1556-472X (electronic)", bibdate = "Sat Aug 14 17:12:30 MDT 2010", bibsource = "http://www.acm.org/pubs/contents/journals/tkdd/; http://www.math.utah.edu/pub/tex/bib/tkdd.bib", abstract = "Insight into the growth (or shrinkage) of ``knowledge communities'' of authors that build on each other's work can be gained by studying the evolution over time of clusters of documents. We cluster documents based on the documents they cite in common using the Streemer clustering method, which finds cohesive foreground clusters (the knowledge communities) embedded in a diffuse background. We build predictive models with features based on the citation structure, the vocabulary of the papers, and the affiliations and prestige of the authors and use these models to study the drivers of community growth and the predictors of how widely a paper will be cited. 
We find that scientific knowledge communities tend to grow more rapidly
                 if their publications build on diverse information and
                 use narrow vocabulary and that papers that lie on the
                 periphery of a community have the highest impact, while
                 those not in any community have the lowest impact.",
  acknowledgement = ack-nhfb,
  articleno =    "7",
  fjournal =     "ACM Transactions on Knowledge Discovery from Data (TKDD)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1054",
  keywords =     "citation analysis; clustering; community evolution;
                 knowledge communities; Text mining",
}

@Article{Ji:2010:SSL,
  author =       "Shuiwang Ji and Lei Tang and Shipeng Yu and Jieping Ye",
  title =        "A shared-subspace learning framework for multi-label
                 classification",
  journal =      j-TKDD,
  volume =       "4",
  number =       "2",
  pages =        "8:1--8:??",
  month =        may,
  year =         "2010",
  CODEN =        "????",
  DOI =          "http://doi.acm.org/10.1145/1754428.1754431",
  ISSN =         "1556-4681 (print), 1556-472X (electronic)",
  bibdate =      "Sat Aug 14 17:12:30 MDT 2010",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tkdd/;
                 http://www.math.utah.edu/pub/tex/bib/tkdd.bib",
  abstract =     "Multi-label problems arise in various domains such as
                 multi-topic document categorization, protein function
                 prediction, and automatic image annotation. One natural
                 way to deal with such problems is to construct a binary
                 classifier for each label, resulting in a set of
                 independent binary classification problems. Since
                 multiple labels share the same input space, and the
                 semantics conveyed by different labels are usually
                 correlated, it is essential to exploit the correlation
                 information contained in different labels. In this
                 paper, we consider a general framework for extracting
                 shared structures in multi-label classification. In
                 this framework, a common subspace is assumed to be
                 shared among multiple labels. We show that the optimal
                 solution to the proposed formulation can be obtained by
                 solving a generalized eigenvalue problem, though the
                 problem is nonconvex.
For high-dimensional problems, direct computation of the solution is
                 expensive, and we develop an efficient algorithm for
                 this case. One appealing feature of the proposed
                 framework is that it includes several well-known
                 algorithms as special cases, thus elucidating their
                 intrinsic relationships. We further show that the
                 proposed framework can be extended to the
                 kernel-induced feature space. We have conducted
                 extensive experiments on multi-topic web page
                 categorization and automatic gene expression pattern
                 image annotation tasks, and results demonstrate the
                 effectiveness of the proposed formulation in comparison
                 with several representative algorithms.",
  acknowledgement = ack-nhfb,
  articleno =    "8",
  fjournal =     "ACM Transactions on Knowledge Discovery from Data (TKDD)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1054",
  keywords =     "gene expression pattern image annotation; kernel
                 methods; least squares loss; Multi-label
                 classification; shared subspace; singular value
                 decomposition; web page categorization",
}

@Article{Ruggieri:2010:DMD,
  author =       "Salvatore Ruggieri and Dino Pedreschi and Franco
                 Turini",
  title =        "Data mining for discrimination discovery",
  journal =      j-TKDD,
  volume =       "4",
  number =       "2",
  pages =        "9:1--9:??",
  month =        may,
  year =         "2010",
  CODEN =        "????",
  DOI =          "http://doi.acm.org/10.1145/1754428.1754432",
  ISSN =         "1556-4681 (print), 1556-472X (electronic)",
  bibdate =      "Sat Aug 14 17:12:30 MDT 2010",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tkdd/;
                 http://www.math.utah.edu/pub/tex/bib/tkdd.bib",
  abstract =     "In the context of civil rights law, discrimination
                 refers to unfair or unequal treatment of people based
                 on membership to a category or a minority, without
                 regard to individual merit. Discrimination in credit,
                 mortgage, insurance, labor market, and education has
                 been investigated by researchers in economics and human
                 sciences.
With the advent of automatic decision support systems, such as credit
                 scoring systems, the ease of data collection opens
                 several challenges to data analysts for the fight
                 against discrimination. In this article, we introduce
                 the problem of discovering discrimination through data
                 mining in a dataset of historical decision records,
                 taken by humans or by automatic systems. We formalize
                 the processes of direct and indirect discrimination
                 discovery by modelling protected-by-law groups and
                 contexts where discrimination occurs in a
                 classification rule based syntax. Basically,
                 classification rules extracted from the dataset allow
                 for unveiling contexts of unlawful discrimination,
                 where the degree of burden over protected-by-law groups
                 is formalized by an extension of the lift measure of a
                 classification rule. In direct discrimination, the
                 extracted rules can be directly mined in search of
                 discriminatory contexts. In indirect discrimination,
                 the mining process needs some background knowledge as a
                 further input, for example, census data, that combined
                 with the extracted rules might allow for unveiling
                 contexts of discriminatory decisions. A strategy
                 adopted for combining extracted classification rules
                 with background knowledge is called an inference model.
                 In this article, we propose two inference models and
                 provide automatic procedures for their implementation.
                 An empirical assessment of our results is provided on
                 the German credit dataset and on the PKDD Discovery
                 Challenge 1999 financial dataset.",
  acknowledgement = ack-nhfb,
  articleno =    "9",
  fjournal =     "ACM Transactions on Knowledge Discovery from Data (TKDD)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1054",
  keywords =     "classification rules; Discrimination",
}

@Article{Thomas:2010:MMF,
  author =       "Lini T. Thomas and Satyanarayana R.
Valluri and Kamalakar Karlapalem",
  title =        "{MARGIN}: {Maximal} frequent subgraph mining",
  journal =      j-TKDD,
  volume =       "4",
  number =       "3",
  pages =        "10:1--10:??",
  month =        oct,
  year =         "2010",
  CODEN =        "????",
  DOI =          "http://dx.doi.org/10.1145/1839490.1839491",
  ISSN =         "1556-4681 (print), 1556-472X (electronic)",
  bibdate =      "Mon Mar 28 11:43:57 MDT 2011",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tkdd/;
                 http://www.math.utah.edu/pub/tex/bib/tkdd.bib",
  acknowledgement = ack-nhfb,
  articleno =    "10",
  fjournal =     "ACM Transactions on Knowledge Discovery from Data (TKDD)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1054",
}

@Article{Deodhar:2010:SFS,
  author =       "Meghana Deodhar and Joydeep Ghosh",
  title =        "{SCOAL}: {A} framework for simultaneous co-clustering
                 and learning from complex data",
  journal =      j-TKDD,
  volume =       "4",
  number =       "3",
  pages =        "11:1--11:??",
  month =        oct,
  year =         "2010",
  CODEN =        "????",
  DOI =          "http://dx.doi.org/10.1145/1839490.1839492",
  ISSN =         "1556-4681 (print), 1556-472X (electronic)",
  bibdate =      "Mon Mar 28 11:43:57 MDT 2011",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tkdd/;
                 http://www.math.utah.edu/pub/tex/bib/tkdd.bib",
  acknowledgement = ack-nhfb,
  articleno =    "11",
  fjournal =     "ACM Transactions on Knowledge Discovery from Data (TKDD)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1054",
}

@Article{Chen:2010:BBI,
  author =       "Jinlin Chen and Keli Xiao",
  title =        "{BISC}: {A} bitmap itemset support counting approach
                 for efficient frequent itemset mining",
  journal =      j-TKDD,
  volume =       "4",
  number =       "3",
  pages =        "12:1--12:??",
  month =        oct,
  year =         "2010",
  CODEN =        "????",
  DOI =          "http://dx.doi.org/10.1145/1839490.1839493",
  ISSN =         "1556-4681 (print), 1556-472X (electronic)",
  bibdate =      "Mon Mar 28 11:43:57 MDT 2011",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tkdd/;
                 http://www.math.utah.edu/pub/tex/bib/tkdd.bib",
  acknowledgement = ack-nhfb,
  articleno =    "12",
  fjournal =     "ACM Transactions on Knowledge Discovery from Data
(TKDD)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1054",
}

@Article{Becchetti:2010:EAL,
  author =       "Luca Becchetti and Paolo Boldi and Carlos Castillo and
                 Aristides Gionis",
  title =        "Efficient algorithms for large-scale local triangle
                 counting",
  journal =      j-TKDD,
  volume =       "4",
  number =       "3",
  pages =        "13:1--13:??",
  month =        oct,
  year =         "2010",
  CODEN =        "????",
  DOI =          "http://dx.doi.org/10.1145/1839490.1839494",
  ISSN =         "1556-4681 (print), 1556-472X (electronic)",
  bibdate =      "Mon Mar 28 11:43:57 MDT 2011",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tkdd/;
                 http://www.math.utah.edu/pub/tex/bib/tkdd.bib",
  acknowledgement = ack-nhfb,
  articleno =    "13",
  fjournal =     "ACM Transactions on Knowledge Discovery from Data (TKDD)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1054",
}

@Article{Zhang:2010:MDR,
  author =       "Yin Zhang and Zhi-Hua Zhou",
  title =        "Multilabel dimensionality reduction via dependence
                 maximization",
  journal =      j-TKDD,
  volume =       "4",
  number =       "3",
  pages =        "14:1--14:??",
  month =        oct,
  year =         "2010",
  CODEN =        "????",
  DOI =          "http://dx.doi.org/10.1145/1839490.1839495",
  ISSN =         "1556-4681 (print), 1556-472X (electronic)",
  bibdate =      "Mon Mar 28 11:43:57 MDT 2011",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tkdd/;
                 http://www.math.utah.edu/pub/tex/bib/tkdd.bib",
  acknowledgement = ack-nhfb,
  articleno =    "14",
  fjournal =     "ACM Transactions on Knowledge Discovery from Data (TKDD)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1054",
}

@Article{Cui:2010:LMN,
  author =       "Ying Cui and Xiaoli Z. Fern and Jennifer G.
Dy",
  title =        "Learning multiple nonredundant clusterings",
  journal =      j-TKDD,
  volume =       "4",
  number =       "3",
  pages =        "15:1--15:??",
  month =        oct,
  year =         "2010",
  CODEN =        "????",
  DOI =          "http://dx.doi.org/10.1145/1839490.1839496",
  ISSN =         "1556-4681 (print), 1556-472X (electronic)",
  bibdate =      "Mon Mar 28 11:43:57 MDT 2011",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tkdd/;
                 http://www.math.utah.edu/pub/tex/bib/tkdd.bib",
  acknowledgement = ack-nhfb,
  articleno =    "15",
  fjournal =     "ACM Transactions on Knowledge Discovery from Data (TKDD)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1054",
}

@Article{Wang:2010:TSI,
  author =       "Wei Wang",
  title =        "{TKDD} Special Issue: {SIGKDD 2009}",
  journal =      j-TKDD,
  volume =       "4",
  number =       "4",
  pages =        "16:1--16:??",
  month =        oct,
  year =         "2010",
  CODEN =        "????",
  DOI =          "http://dx.doi.org/10.1145/1857947.1857948",
  ISSN =         "1556-4681 (print), 1556-472X (electronic)",
  bibdate =      "Mon Mar 28 11:43:58 MDT 2011",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tkdd/;
                 http://www.math.utah.edu/pub/tex/bib/tkdd.bib",
  acknowledgement = ack-nhfb,
  articleno =    "16",
  fjournal =     "ACM Transactions on Knowledge Discovery from Data (TKDD)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1054",
}

@Article{Chen:2010:BTA,
  author =       "Ye Chen and Dmitry Pavlov and John F.
Canny",
  title =        "Behavioral Targeting: The Art of Scaling Up Simple
                 Algorithms",
  journal =      j-TKDD,
  volume =       "4",
  number =       "4",
  pages =        "17:1--17:??",
  month =        oct,
  year =         "2010",
  CODEN =        "????",
  DOI =          "http://dx.doi.org/10.1145/1857947.1857949",
  ISSN =         "1556-4681 (print), 1556-472X (electronic)",
  bibdate =      "Mon Mar 28 11:43:58 MDT 2011",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tkdd/;
                 http://www.math.utah.edu/pub/tex/bib/tkdd.bib",
  acknowledgement = ack-nhfb,
  articleno =    "17",
  fjournal =     "ACM Transactions on Knowledge Discovery from Data (TKDD)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1054",
}

@Article{Mohammed:2010:CDA,
  author =       "Noman Mohammed and Benjamin C. M. Fung and Patrick C.
                 K. Hung and Cheuk-Kwong Lee",
  title =        "Centralized and Distributed Anonymization for
                 High-Dimensional Healthcare Data",
  journal =      j-TKDD,
  volume =       "4",
  number =       "4",
  pages =        "18:1--18:??",
  month =        oct,
  year =         "2010",
  CODEN =        "????",
  DOI =          "http://dx.doi.org/10.1145/1857947.1857950",
  ISSN =         "1556-4681 (print), 1556-472X (electronic)",
  bibdate =      "Mon Mar 28 11:43:58 MDT 2011",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tkdd/;
                 http://www.math.utah.edu/pub/tex/bib/tkdd.bib",
  acknowledgement = ack-nhfb,
  articleno =    "18",
  fjournal =     "ACM Transactions on Knowledge Discovery from Data (TKDD)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1054",
}

@Article{Liu:2010:BBM,
  author =       "Chao Liu and Fan Guo and Christos Faloutsos",
  title =        "{Bayesian} Browsing Model: Exact Inference of Document
                 Relevance from Petabyte-Scale Data",
  journal =      j-TKDD,
  volume =       "4",
  number =       "4",
  pages =        "19:1--19:??",
  month =        oct,
  year =         "2010",
  CODEN =        "????",
  DOI =          "http://dx.doi.org/10.1145/1857947.1857951",
  ISSN =         "1556-4681 (print), 1556-472X (electronic)",
  bibdate =      "Mon Mar 28 11:43:58 MDT 2011",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tkdd/;
                 http://www.math.utah.edu/pub/tex/bib/tkdd.bib",
  acknowledgement = ack-nhfb,
  articleno =    "19",
  fjournal =     "ACM Transactions on Knowledge Discovery from Data (TKDD)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1054",
}

@Article{Wu:2010:MAF,
  author =       "Mingxi Wu and Chris Jermaine and Sanjay Ranka and
                 Xiuyao Song and John Gums",
  title =        "A Model-Agnostic Framework for Fast Spatial Anomaly
                 Detection",
  journal =      j-TKDD,
  volume =       "4",
  number =       "4",
  pages =        "20:1--20:??",
  month =        oct,
  year =         "2010",
  CODEN =        "????",
  DOI =          "http://dx.doi.org/10.1145/1857947.1857952",
  ISSN =         "1556-4681 (print), 1556-472X (electronic)",
  bibdate =      "Mon Mar 28 11:43:58 MDT 2011",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tkdd/;
                 http://www.math.utah.edu/pub/tex/bib/tkdd.bib",
  acknowledgement = ack-nhfb,
  articleno =    "20",
  fjournal =     "ACM Transactions on Knowledge Discovery from Data (TKDD)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1054",
}

@Article{Zhong:2010:ATS,
  author =       "Ning Zhong and Gregory Piatetsky-Shapiro and Yiyu Yao
                 and Philip S. Yu",
  title =        "{ACM TKDD} Special Issue on Knowledge Discovery for
                 {Web} Intelligence",
  journal =      j-TKDD,
  volume =       "5",
  number =       "1",
  pages =        "1:1--1:??",
  month =        dec,
  year =         "2010",
  CODEN =        "????",
  DOI =          "http://dx.doi.org/10.1145/1870096.1870097",
  ISSN =         "1556-4681 (print), 1556-472X (electronic)",
  bibdate =      "Mon Mar 28 11:43:59 MDT 2011",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tkdd/;
                 http://www.math.utah.edu/pub/tex/bib/tkdd.bib",
  acknowledgement = ack-nhfb,
  articleno =    "1",
  fjournal =     "ACM Transactions on Knowledge Discovery from Data (TKDD)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1054",
}

@Article{Tang:2010:CAW,
  author =       "Jie Tang and Limin Yao and Duo Zhang and Jing Zhang",
  title =        "A Combination Approach to {Web} User Profiling",
  journal =      j-TKDD,
  volume =       "5",
  number =       "1",
  pages =        "2:1--2:??",
  month =        dec,
  year =         "2010",
  CODEN =        "????",
  DOI =          "http://dx.doi.org/10.1145/1870096.1870098",
  ISSN =         "1556-4681 (print), 1556-472X (electronic)",
  bibdate =      "Mon Mar 28 11:43:59 MDT 2011",
bibsource =      "http://www.acm.org/pubs/contents/journals/tkdd/;
                 http://www.math.utah.edu/pub/tex/bib/tkdd.bib",
  acknowledgement = ack-nhfb,
  articleno =    "2",
  fjournal =     "ACM Transactions on Knowledge Discovery from Data (TKDD)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1054",
}

@Article{Bouguessa:2010:DKS,
  author =       "Mohamed Bouguessa and Shengrui Wang and Benoit
                 Dumoulin",
  title =        "Discovering Knowledge-Sharing Communities in
                 Question-Answering Forums",
  journal =      j-TKDD,
  volume =       "5",
  number =       "1",
  pages =        "3:1--3:??",
  month =        dec,
  year =         "2010",
  CODEN =        "????",
  DOI =          "http://dx.doi.org/10.1145/1870096.1870099",
  ISSN =         "1556-4681 (print), 1556-472X (electronic)",
  bibdate =      "Mon Mar 28 11:43:59 MDT 2011",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tkdd/;
                 http://www.math.utah.edu/pub/tex/bib/tkdd.bib",
  acknowledgement = ack-nhfb,
  articleno =    "3",
  fjournal =     "ACM Transactions on Knowledge Discovery from Data (TKDD)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1054",
}

@Article{Plangprasopchok:2010:MSA,
  author =       "Anon Plangprasopchok and Kristina Lerman",
  title =        "Modeling Social Annotation: {A} {Bayesian} Approach",
  journal =      j-TKDD,
  volume =       "5",
  number =       "1",
  pages =        "4:1--4:??",
  month =        dec,
  year =         "2010",
  CODEN =        "????",
  DOI =          "http://dx.doi.org/10.1145/1870096.1870100",
  ISSN =         "1556-4681 (print), 1556-472X (electronic)",
  bibdate =      "Mon Mar 28 11:43:59 MDT 2011",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tkdd/;
                 http://www.math.utah.edu/pub/tex/bib/tkdd.bib",
  acknowledgement = ack-nhfb,
  articleno =    "4",
  fjournal =     "ACM Transactions on Knowledge Discovery from Data (TKDD)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1054",
}

@Article{Sakurai:2010:FDG,
  author =       "Yasushi Sakurai and Christos Faloutsos and Spiros
                 Papadimitriou",
  title =        "Fast Discovery of Group Lag Correlations in Streams",
  journal =      j-TKDD,
  volume =       "5",
  number =       "1",
  pages =        "5:1--5:??",
  month =        dec,
  year =         "2010",
  CODEN =        "????",
  DOI =          "http://dx.doi.org/10.1145/1870096.1870101",
  ISSN =         "1556-4681 (print), 1556-472X (electronic)",
  bibdate =      "Mon Mar 28 11:43:59 MDT 2011",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tkdd/;
                 http://www.math.utah.edu/pub/tex/bib/tkdd.bib",
  acknowledgement = ack-nhfb,
  articleno =    "5",
  fjournal =     "ACM Transactions on Knowledge Discovery from Data (TKDD)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1054",
}

@Article{Liu:2010:FCP,
  author =       "Kun Liu and Evimaria Terzi",
  title =        "A Framework for Computing the Privacy Scores of Users
                 in Online Social Networks",
  journal =      j-TKDD,
  volume =       "5",
  number =       "1",
  pages =        "6:1--6:??",
  month =        dec,
  year =         "2010",
  CODEN =        "????",
  DOI =          "http://dx.doi.org/10.1145/1870096.1870102",
  ISSN =         "1556-4681 (print), 1556-472X (electronic)",
  bibdate =      "Mon Mar 28 11:43:59 MDT 2011",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tkdd/;
                 http://www.math.utah.edu/pub/tex/bib/tkdd.bib",
  acknowledgement = ack-nhfb,
  articleno =    "6",
  fjournal =     "ACM Transactions on Knowledge Discovery from Data (TKDD)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1054",
}

@Article{Sun:2011:ISI,
  author =       "Jimeng Sun and Yan Liu and Jie Tang and Chid Apte",
  title =        "Introduction to Special Issue on Large-Scale Data
                 Mining",
  journal =      j-TKDD,
  volume =       "5",
  number =       "2",
  pages =        "7:1--7:??",
  month =        feb,
  year =         "2011",
  CODEN =        "????",
  DOI =          "http://dx.doi.org/10.1145/1921632.1921633",
  ISSN =         "1556-4681 (print), 1556-472X (electronic)",
  bibdate =      "Mon Mar 28 11:44:01 MDT 2011",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tkdd/;
                 http://www.math.utah.edu/pub/tex/bib/tkdd.bib",
  acknowledgement = ack-nhfb,
  articleno =    "7",
  fjournal =     "ACM Transactions on Knowledge Discovery from Data (TKDD)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1054",
}

@Article{Kang:2011:HMR,
  author =       "U. Kang and Charalampos E.
Tsourakakis and Ana Paula Appel and Christos Faloutsos and Jure
                 Leskovec",
  title =        "{HADI}: Mining Radii of Large Graphs",
  journal =      j-TKDD,
  volume =       "5",
  number =       "2",
  pages =        "8:1--8:??",
  month =        feb,
  year =         "2011",
  CODEN =        "????",
  DOI =          "http://dx.doi.org/10.1145/1921632.1921634",
  ISSN =         "1556-4681 (print), 1556-472X (electronic)",
  bibdate =      "Mon Mar 28 11:44:01 MDT 2011",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tkdd/;
                 http://www.math.utah.edu/pub/tex/bib/tkdd.bib",
  acknowledgement = ack-nhfb,
  articleno =    "8",
  fjournal =     "ACM Transactions on Knowledge Discovery from Data (TKDD)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1054",
}

@Article{deVries:2011:RRL,
  author =       "Timothy de Vries and Hui Ke and Sanjay Chawla and
                 Peter Christen",
  title =        "Robust Record Linkage Blocking Using Suffix Arrays and
                 {Bloom} Filters",
  journal =      j-TKDD,
  volume =       "5",
  number =       "2",
  pages =        "9:1--9:??",
  month =        feb,
  year =         "2011",
  CODEN =        "????",
  DOI =          "http://dx.doi.org/10.1145/1921632.1921635",
  ISSN =         "1556-4681 (print), 1556-472X (electronic)",
  bibdate =      "Mon Mar 28 11:44:01 MDT 2011",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tkdd/;
                 http://www.math.utah.edu/pub/tex/bib/tkdd.bib",
  acknowledgement = ack-nhfb,
  articleno =    "9",
  fjournal =     "ACM Transactions on Knowledge Discovery from Data (TKDD)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1054",
}

@Article{Dunlavy:2011:TLP,
  author =       "Daniel M. Dunlavy and Tamara G.
Kolda and Evrim Acar",
  title =        "Temporal Link Prediction Using Matrix and Tensor
                 Factorizations",
  journal =      j-TKDD,
  volume =       "5",
  number =       "2",
  pages =        "10:1--10:??",
  month =        feb,
  year =         "2011",
  CODEN =        "????",
  DOI =          "http://dx.doi.org/10.1145/1921632.1921636",
  ISSN =         "1556-4681 (print), 1556-472X (electronic)",
  bibdate =      "Mon Mar 28 11:44:01 MDT 2011",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tkdd/;
                 http://www.math.utah.edu/pub/tex/bib/tkdd.bib",
  acknowledgement = ack-nhfb,
  articleno =    "10",
  fjournal =     "ACM Transactions on Knowledge Discovery from Data (TKDD)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1054",
}

@Article{Magdalinos:2011:ECQ,
  author =       "Panagis Magdalinos and Christos Doulkeridis and
                 Michalis Vazirgiannis",
  title =        "Enhancing Clustering Quality through Landmark-Based
                 Dimensionality Reduction",
  journal =      j-TKDD,
  volume =       "5",
  number =       "2",
  pages =        "11:1--11:??",
  month =        feb,
  year =         "2011",
  CODEN =        "????",
  DOI =          "http://dx.doi.org/10.1145/1921632.1921637",
  ISSN =         "1556-4681 (print), 1556-472X (electronic)",
  bibdate =      "Mon Mar 28 11:44:01 MDT 2011",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tkdd/;
                 http://www.math.utah.edu/pub/tex/bib/tkdd.bib",
  acknowledgement = ack-nhfb,
  articleno =    "11",
  fjournal =     "ACM Transactions on Knowledge Discovery from Data (TKDD)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1054",
}

@Article{Cheng:2011:CLA,
  author =       "Hong Cheng and Yang Zhou and Jeffrey Xu Yu",
  title =        "Clustering Large Attributed Graphs: {A} Balance
                 between Structural and Attribute Similarities",
  journal =      j-TKDD,
  volume =       "5",
  number =       "2",
  pages =        "12:1--12:??",
  month =        feb,
  year =         "2011",
  CODEN =        "????",
  DOI =          "http://dx.doi.org/10.1145/1921632.1921638",
  ISSN =         "1556-4681 (print), 1556-472X (electronic)",
  bibdate =      "Mon Mar 28 11:44:01 MDT 2011",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tkdd/;
                 http://www.math.utah.edu/pub/tex/bib/tkdd.bib",
  acknowledgement = ack-nhfb,
  articleno =    "12",
fjournal =       "ACM Transactions on Knowledge Discovery from Data (TKDD)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1054",
}

@Article{Menon:2011:FAA,
  author =       "Aditya Krishna Menon and Charles Elkan",
  title =        "Fast Algorithms for Approximating the Singular Value
                 Decomposition",
  journal =      j-TKDD,
  volume =       "5",
  number =       "2",
  pages =        "13:1--13:??",
  month =        feb,
  year =         "2011",
  CODEN =        "????",
  DOI =          "http://dx.doi.org/10.1145/1921632.1921639",
  ISSN =         "1556-4681 (print), 1556-472X (electronic)",
  bibdate =      "Mon Mar 28 11:44:01 MDT 2011",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tkdd/;
                 http://www.math.utah.edu/pub/tex/bib/tkdd.bib",
  abstract =     "A low-rank approximation to a matrix $A$ is a matrix
                 with significantly smaller rank than $A$, and which is
                 close to $A$ according to some norm. Many practical
                 applications involving the use of large matrices focus
                 on low-rank approximations. By reducing the rank or
                 dimensionality of the data, we reduce the complexity of
                 analyzing the data. The singular value decomposition is
                 the most popular low-rank matrix approximation.
                 However, due to its expensive computational
                 requirements, it has often been considered intractable
                 for practical applications involving massive data.
                 Recent developments have tried to address this problem,
                 with several methods proposed to approximate the
                 decomposition with better asymptotic runtime. We
                 present an empirical study of these techniques on a
                 variety of dense and sparse datasets. We find that a
                 sampling approach of Drineas, Kannan and Mahoney is
                 often, but not always, the best performing method. This
                 method gives solutions with high accuracy much faster
                 than classical SVD algorithms, on large sparse datasets
                 in particular. Other modern methods, such as a recent
                 algorithm by Rokhlin and Tygert, also offer savings
                 compared to classical SVD algorithms.
The older sampling methods of Achlioptas and McSherry are shown to
                 sometimes take longer than classical SVD.",
  acknowledgement = ack-nhfb,
  articleno =    "13",
  fjournal =     "ACM Transactions on Knowledge Discovery from Data (TKDD)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1054",
}

@Article{Wang:2011:IDC,
  author =       "Dingding Wang and Shenghuo Zhu and Tao Li and Yun Chi
                 and Yihong Gong",
  title =        "Integrating Document Clustering and Multidocument
                 Summarization",
  journal =      j-TKDD,
  volume =       "5",
  number =       "3",
  pages =        "14:1--14:??",
  month =        aug,
  year =         "2011",
  CODEN =        "????",
  DOI =          "http://dx.doi.org/10.1145/1993077.1993078",
  ISSN =         "1556-4681 (print), 1556-472X (electronic)",
  bibdate =      "Thu Aug 18 13:28:08 MDT 2011",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tkdd/;
                 http://www.math.utah.edu/pub/tex/bib/tkdd.bib",
  acknowledgement = ack-nhfb,
  articleno =    "14",
  fjournal =     "ACM Transactions on Knowledge Discovery from Data (TKDD)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1054",
}

@Article{Maier:2011:INS,
  author =       "Marc Maier and Matthew Rattigan and David Jensen",
  title =        "Indexing Network Structure with Shortest-Path Trees",
  journal =      j-TKDD,
  volume =       "5",
  number =       "3",
  pages =        "15:1--15:??",
  month =        aug,
  year =         "2011",
  CODEN =        "????",
  DOI =          "http://dx.doi.org/10.1145/1993077.1993079",
  ISSN =         "1556-4681 (print), 1556-472X (electronic)",
  bibdate =      "Thu Aug 18 13:28:08 MDT 2011",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tkdd/;
                 http://www.math.utah.edu/pub/tex/bib/tkdd.bib",
  acknowledgement = ack-nhfb,
  articleno =    "15",
  fjournal =     "ACM Transactions on Knowledge Discovery from Data (TKDD)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1054",
}

@Article{Wong:2011:CUA,
  author =       "Raymond Chi-Wing Wong and Ada Wai-Chee Fu and Ke Wang
                 and Philip S.
Yu and Jian Pei",
  title =        "Can the Utility of Anonymized Data be Used for Privacy
                 Breaches?",
  journal =      j-TKDD,
  volume =       "5",
  number =       "3",
  pages =        "16:1--16:??",
  month =        aug,
  year =         "2011",
  CODEN =        "????",
  DOI =          "http://dx.doi.org/10.1145/1993077.1993080",
  ISSN =         "1556-4681 (print), 1556-472X (electronic)",
  bibdate =      "Thu Aug 18 13:28:08 MDT 2011",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tkdd/;
                 http://www.math.utah.edu/pub/tex/bib/tkdd.bib",
  acknowledgement = ack-nhfb,
  articleno =    "16",
  fjournal =     "ACM Transactions on Knowledge Discovery from Data (TKDD)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1054",
}

@Article{Lin:2011:CDM,
  author =       "Yu-Ru Lin and Jimeng Sun and Hari Sundaram and Aisling
                 Kelliher and Paul Castro and Ravi Konuru",
  title =        "Community Discovery via Metagraph Factorization",
  journal =      j-TKDD,
  volume =       "5",
  number =       "3",
  pages =        "17:1--17:??",
  month =        aug,
  year =         "2011",
  CODEN =        "????",
  DOI =          "http://dx.doi.org/10.1145/1993077.1993081",
  ISSN =         "1556-4681 (print), 1556-472X (electronic)",
  bibdate =      "Thu Aug 18 13:28:08 MDT 2011",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tkdd/;
                 http://www.math.utah.edu/pub/tex/bib/tkdd.bib",
  acknowledgement = ack-nhfb,
  articleno =    "17",
  fjournal =     "ACM Transactions on Knowledge Discovery from Data (TKDD)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1054",
}

@Article{Elkan:2012:GES,
  author =       "Charles Elkan and Yehuda Koren",
  title =        "Guest Editorial for Special Issue {KDD'10}",
  journal =      j-TKDD,
  volume =       "5",
  number =       "4",
  pages =        "18:1--18:??",
  month =        feb,
  year =         "2012",
  CODEN =        "????",
  DOI =          "http://dx.doi.org/10.1145/2086737.2086738",
  ISSN =         "1556-4681 (print), 1556-472X (electronic)",
  ISSN-L =       "1556-4681",
  bibdate =      "Fri Mar 16 15:19:57 MDT 2012",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tkdd/;
                 http://www.math.utah.edu/pub/tex/bib/tkdd.bib",
  acknowledgement = ack-nhfb,
  articleno =    "18",
  fjournal =     "ACM Transactions on Knowledge Discovery from
Data (TKDD)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1054",
}

@Article{Iwata:2012:SMT,
  author =       "Tomoharu Iwata and Takeshi Yamada and Yasushi Sakurai
                 and Naonori Ueda",
  title =        "Sequential Modeling of Topic Dynamics with Multiple
                 Timescales",
  journal =      j-TKDD,
  volume =       "5",
  number =       "4",
  pages =        "19:1--19:??",
  month =        feb,
  year =         "2012",
  CODEN =        "????",
  DOI =          "http://dx.doi.org/10.1145/2086737.2086739",
  ISSN =         "1556-4681 (print), 1556-472X (electronic)",
  ISSN-L =       "1556-4681",
  bibdate =      "Fri Mar 16 15:19:57 MDT 2012",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tkdd/;
                 http://www.math.utah.edu/pub/tex/bib/tkdd.bib",
  abstract =     "We propose an online topic model for sequentially
                 analyzing the time evolution of topics in document
                 collections. Topics naturally evolve with multiple
                 timescales. For example, some words may be used
                 consistently over one hundred years, while other words
                 emerge and disappear over periods of a few days. Thus,
                 in the proposed model, current topic-specific
                 distributions over words are assumed to be generated
                 based on the multiscale word distributions of the
                 previous epoch. Considering both the long- and
                 short-timescale dependency yields a more robust model.
                 We derive efficient online inference procedures based
                 on a stochastic EM algorithm, in which the model is
                 sequentially updated using newly obtained data; this
                 means that past data are not required to make the
                 inference. We demonstrate the effectiveness of the
                 proposed method in terms of predictive performance and
                 computational efficiency by examining collections of
                 real documents with timestamps.",
  acknowledgement = ack-nhfb,
  articleno =    "19",
  fjournal =     "ACM Transactions on Knowledge Discovery from Data (TKDD)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1054",
}

@Article{Huh:2012:DTM,
  author =       "Seungil Huh and Stephen E.
Fienberg",
  title =        "Discriminative Topic Modeling Based on Manifold
                 Learning",
  journal =      j-TKDD,
  volume =       "5",
  number =       "4",
  pages =        "20:1--20:??",
  month =        feb,
  year =         "2012",
  CODEN =        "????",
  DOI =          "http://dx.doi.org/10.1145/2086737.2086740",
  ISSN =         "1556-4681 (print), 1556-472X (electronic)",
  ISSN-L =       "1556-4681",
  bibdate =      "Fri Mar 16 15:19:57 MDT 2012",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tkdd/;
                 http://www.math.utah.edu/pub/tex/bib/tkdd.bib",
  abstract =     "Topic modeling has become a popular method used for
                 data analysis in various domains including text
                 documents. Previous topic model approaches, such as
                 probabilistic Latent Semantic Analysis (pLSA) and
                 Latent Dirichlet Allocation (LDA), have shown
                 impressive success in discovering low-rank hidden
                 structures for modeling text documents. These
                 approaches, however do not take into account the
                 manifold structure of the data, which is generally
                 informative for nonlinear dimensionality reduction
                 mapping. More recent topic model approaches, Laplacian
                 PLSI (LapPLSI) and Locally-consistent Topic Model
                 (LTM), have incorporated the local manifold structure
                 into topic models and have shown resulting benefits.
                 But they fall short of achieving full discriminating
                 power of manifold learning as they only enhance the
                 proximity between the low-rank representations of
                 neighboring pairs without any consideration for
                 non-neighboring pairs. In this article, we propose a
                 new approach, Discriminative Topic Model (DTM), which
                 separates non-neighboring pairs from each other in
                 addition to bringing neighboring pairs closer together,
                 thereby preserving the global manifold structure as
                 well as improving local consistency. We also present a
                 novel model-fitting algorithm based on the generalized
                 EM algorithm and the concept of Pareto improvement.
We empirically demonstrate the success of DTM in terms of unsupervised
                 clustering and semisupervised classification accuracies
                 on text corpora and robustness to parameters compared
                 to state-of-the-art techniques.",
  acknowledgement = ack-nhfb,
  articleno =    "20",
  fjournal =     "ACM Transactions on Knowledge Discovery from Data (TKDD)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1054",
}

@Article{Gomez-Rodriguez:2012:IND,
  author =       "Manuel Gomez-Rodriguez and Jure Leskovec and Andreas
                 Krause",
  title =        "Inferring Networks of Diffusion and Influence",
  journal =      j-TKDD,
  volume =       "5",
  number =       "4",
  pages =        "21:1--21:??",
  month =        feb,
  year =         "2012",
  CODEN =        "????",
  DOI =          "http://dx.doi.org/10.1145/2086737.2086741",
  ISSN =         "1556-4681 (print), 1556-472X (electronic)",
  ISSN-L =       "1556-4681",
  bibdate =      "Fri Mar 16 15:19:57 MDT 2012",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tkdd/;
                 http://www.math.utah.edu/pub/tex/bib/tkdd.bib",
  abstract =     "Information diffusion and virus propagation are
                 fundamental processes taking place in networks. While
                 it is often possible to directly observe when nodes
                 become infected with a virus or publish the
                 information, observing individual transmissions (who
                 infects whom, or who influences whom) is typically very
                 difficult. Furthermore, in many applications, the
                 underlying network over which the diffusions and
                 propagations spread is actually unobserved. We tackle
                 these challenges by developing a method for tracing
                 paths of diffusion and influence through networks and
                 inferring the networks over which contagions propagate.
                 Given the times when nodes adopt pieces of information
                 or become infected, we identify the optimal network
                 that best explains the observed infection times. Since
                 the optimization problem is NP-hard to solve exactly,
                 we develop an efficient approximation algorithm that
                 scales to large datasets and finds provably
                 near-optimal networks.
We demonstrate the effectiveness of our approach by tracing information diffusion in a set of 170 million blogs and news articles over a one year period to infer how information flows through the online media space. We find that the diffusion network of news for the top 1,000 media sites and blogs tends to have a core-periphery structure with a small set of core media sites that diffuse information to the rest of the Web. These sites tend to have stable circles of influence with more general news media sites acting as connectors between them.", acknowledgement = ack-nhfb, articleno = "21", fjournal = "ACM Transactions on Knowledge Discovery from Data (TKDD)", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J1054", } @Article{Chen:2012:LIS, author = "Jianhui Chen and Ji Liu and Jieping Ye", title = "Learning Incoherent Sparse and Low-Rank Patterns from Multiple Tasks", journal = j-TKDD, volume = "5", number = "4", pages = "22:1--22:??", month = feb, year = "2012", CODEN = "????", DOI = "http://dx.doi.org/10.1145/2086737.2086742", ISSN = "1556-4681 (print), 1556-472X (electronic)", ISSN-L = "1556-4681", bibdate = "Fri Mar 16 15:19:57 MDT 2012", bibsource = "http://www.acm.org/pubs/contents/journals/tkdd/; http://www.math.utah.edu/pub/tex/bib/tkdd.bib", abstract = "We consider the problem of learning incoherent sparse and low-rank patterns from multiple tasks. Our approach is based on a linear multitask learning formulation, in which the sparse and low-rank patterns are induced by a cardinality regularization term and a low-rank constraint, respectively. This formulation is nonconvex; we convert it into its convex surrogate, which can be routinely solved via semidefinite programming for small-size problems. We propose employing the general projected gradient scheme to efficiently solve such a convex surrogate; however, in the optimization formulation, the objective function is nondifferentiable and the feasible domain is nontrivial. 
We present the procedures for computing the projected gradient and ensuring the global convergence of the projected gradient scheme. The computation of the projected gradient involves a constrained optimization problem; we show that the optimal solution to such a problem can be obtained via solving an unconstrained optimization subproblem and a Euclidean projection subproblem. We also present two projected gradient algorithms and analyze their rates of convergence in detail. In addition, we illustrate the use of the presented projected gradient algorithms for the proposed multitask learning formulation using the least squares loss. Experimental results on a collection of real-world data sets demonstrate the effectiveness of the proposed multitask learning formulation and the efficiency of the proposed projected gradient algorithms.", acknowledgement = ack-nhfb, articleno = "22", fjournal = "ACM Transactions on Knowledge Discovery from Data (TKDD)", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J1054", } @Article{Yu:2012:LLC, author = "Hsiang-Fu Yu and Cho-Jui Hsieh and Kai-Wei Chang and Chih-Jen Lin", title = "Large Linear Classification When Data Cannot Fit in Memory", journal = j-TKDD, volume = "5", number = "4", pages = "23:1--23:??", month = feb, year = "2012", CODEN = "????", DOI = "http://dx.doi.org/10.1145/2086737.2086743", ISSN = "1556-4681 (print), 1556-472X (electronic)", ISSN-L = "1556-4681", bibdate = "Fri Mar 16 15:19:57 MDT 2012", bibsource = "http://www.acm.org/pubs/contents/journals/tkdd/; http://www.math.utah.edu/pub/tex/bib/tkdd.bib", abstract = "Recent advances in linear classification have shown that for applications such as document classification, the training process can be extremely efficient. However, most of the existing training methods are designed by assuming that data can be stored in the computer memory. These methods cannot be easily applied to data larger than the memory capacity due to the random access to the disk. 
We propose and analyze a block minimization framework for data larger than the memory size. At each step a block of data is loaded from the disk and handled by certain learning methods. We investigate two implementations of the proposed framework for primal and dual SVMs, respectively. Because data cannot fit in memory, many design considerations are very different from those for traditional algorithms. We discuss and compare with existing approaches that are able to handle data larger than memory. Experiments using data sets 20 times larger than the memory demonstrate the effectiveness of the proposed method.", acknowledgement = ack-nhfb, articleno = "23", fjournal = "ACM Transactions on Knowledge Discovery from Data (TKDD)", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J1054", } @Article{Shahaf:2012:CTL, author = "Dafna Shahaf and Carlos Guestrin", title = "Connecting Two (or Less) Dots: Discovering Structure in News Articles", journal = j-TKDD, volume = "5", number = "4", pages = "24:1--24:??", month = feb, year = "2012", CODEN = "????", DOI = "http://dx.doi.org/10.1145/2086737.2086744", ISSN = "1556-4681 (print), 1556-472X (electronic)", ISSN-L = "1556-4681", bibdate = "Fri Mar 16 15:19:57 MDT 2012", bibsource = "http://www.acm.org/pubs/contents/journals/tkdd/; http://www.math.utah.edu/pub/tex/bib/tkdd.bib", abstract = "Finding information is becoming a major part of our daily life. Entire sectors, from Web users to scientists and intelligence analysts, are increasingly struggling to keep up with the larger and larger amounts of content published every day. With this much data, it is often easy to miss the big picture. In this article, we investigate methods for automatically connecting the dots---providing a structured, easy way to navigate within a new topic and discover hidden connections. We focus on the news domain: given two news articles, our system automatically finds a coherent chain linking them together. 
For example, it can recover the chain of events starting with the decline of home prices (January 2007), and ending with the health care debate (2009). We formalize the characteristics of a good chain and provide a fast search-driven algorithm to connect two fixed endpoints. We incorporate user feedback into our framework, allowing the stories to be refined and personalized. We also provide a method to handle partially-specified endpoints, for users who do not know both ends of a story. Finally, we evaluate our algorithm over real news data. Our user studies demonstrate that the objective we propose captures the users' intuitive notion of coherence, and that our algorithm effectively helps users understand the news.", acknowledgement = ack-nhfb, articleno = "24", fjournal = "ACM Transactions on Knowledge Discovery from Data (TKDD)", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J1054", } @Article{Ienco:2012:CDL, author = "Dino Ienco and Ruggero G. Pensa and Rosa Meo", title = "From Context to Distance: Learning Dissimilarity for Categorical Data Clustering", journal = j-TKDD, volume = "6", number = "1", pages = "1:1--1:??", month = mar, year = "2012", CODEN = "????", DOI = "http://dx.doi.org/10.1145/2133360.2133361", ISSN = "1556-4681 (print), 1556-472X (electronic)", ISSN-L = "1556-4681", bibdate = "Tue Nov 6 18:30:38 MST 2012", bibsource = "http://www.acm.org/pubs/contents/journals/tkdd/; http://www.math.utah.edu/pub/tex/bib/tkdd.bib", abstract = "Clustering data described by categorical attributes is a challenging task in data mining applications. Unlike numerical attributes, it is difficult to define a distance between pairs of values of a categorical attribute, since the values are not ordered. In this article, we propose a framework to learn a context-based distance for categorical attributes. 
The key intuition of this work is that the distance between two values of a categorical attribute $A_i$ can be determined by the way in which the values of the other attributes $A_j$ are distributed in the dataset objects: if they are similarly distributed in the groups of objects in correspondence of the distinct values of $A_i$ a low value of distance is obtained. We propose also a solution to the critical point of the choice of the attributes $A_j$. We validate our approach by embedding our distance learning framework in a hierarchical clustering algorithm. We applied it on various real world and synthetic datasets, both low and high-dimensional. Experimental results show that our method is competitive with respect to the state of the art of categorical data clustering approaches. We also show that our approach is scalable and has a low impact on the overall computational time of a clustering task.", acknowledgement = ack-nhfb, articleno = "1", fjournal = "ACM Transactions on Knowledge Discovery from Data (TKDD)", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J1054", } @Article{Li:2012:EMG, author = "Chun Li and Qingyan Yang and Jianyong Wang and Ming Li", title = "Efficient Mining of Gap-Constrained Subsequences and Its Various Applications", journal = j-TKDD, volume = "6", number = "1", pages = "2:1--2:??", month = mar, year = "2012", CODEN = "????", DOI = "http://dx.doi.org/10.1145/2133360.2133362", ISSN = "1556-4681 (print), 1556-472X (electronic)", ISSN-L = "1556-4681", bibdate = "Tue Nov 6 18:30:38 MST 2012", bibsource = "http://www.acm.org/pubs/contents/journals/tkdd/; http://www.math.utah.edu/pub/tex/bib/tkdd.bib", abstract = "Mining frequent subsequence patterns is a typical data-mining problem and various efficient sequential pattern mining algorithms have been proposed. In many application domains (e.g., biology), the frequent subsequences confined by the predefined gap requirements are more meaningful than the general sequential patterns.
In this article, we propose two algorithms, Gap-BIDE for mining closed gap-constrained subsequences from a set of input sequences, and Gap-Connect for mining repetitive gap-constrained subsequences from a single input sequence. Inspired by some state-of-the-art closed or constrained sequential pattern mining algorithms, the Gap-BIDE algorithm adopts an efficient approach to finding the complete set of closed sequential patterns with gap constraints, while the Gap-Connect algorithm efficiently mines an approximate set of long patterns by connecting short patterns. We also present several methods for feature selection from the set of gap-constrained patterns for the purpose of classification and clustering. Our extensive performance study shows that our approaches are very efficient in mining frequent subsequences with gap constraints, and the gap-constrained pattern based classification/clustering approaches can achieve high-quality results.", acknowledgement = ack-nhfb, articleno = "2", fjournal = "ACM Transactions on Knowledge Discovery from Data (TKDD)", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J1054", } @Article{Liu:2012:IBA, author = "Fei Tony Liu and Kai Ming Ting and Zhi-Hua Zhou", title = "Isolation-Based Anomaly Detection", journal = j-TKDD, volume = "6", number = "1", pages = "3:1--3:??", month = mar, year = "2012", CODEN = "????", DOI = "http://dx.doi.org/10.1145/2133360.2133363", ISSN = "1556-4681 (print), 1556-472X (electronic)", ISSN-L = "1556-4681", bibdate = "Tue Nov 6 18:30:38 MST 2012", bibsource = "http://www.acm.org/pubs/contents/journals/tkdd/; http://www.math.utah.edu/pub/tex/bib/tkdd.bib", abstract = "Anomalies are data points that are few and different. As a result of these properties, we show that, anomalies are susceptible to a mechanism called isolation. 
This article proposes a method called Isolation Forest ($i$Forest), which detects anomalies purely based on the concept of isolation without employing any distance or density measure---fundamentally different from all existing methods. As a result, $i$Forest is able to exploit subsampling (i) to achieve a low linear time-complexity and a small memory-requirement and (ii) to deal with the effects of swamping and masking effectively. Our empirical evaluation shows that $i$Forest outperforms ORCA, one-class SVM, LOF and Random Forests in terms of AUC, processing time, and it is robust against masking and swamping effects. $i$Forest also works well in high dimensional problems containing a large number of irrelevant attributes, and when anomalies are not available in training sample.", acknowledgement = ack-nhfb, articleno = "3", fjournal = "ACM Transactions on Knowledge Discovery from Data (TKDD)", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J1054", } @Article{Jin:2012:MML, author = "Yu Jin and Nick Duffield and Jeffrey Erman and Patrick Haffner and Subhabrata Sen and Zhi-Li Zhang", title = "A Modular Machine Learning System for Flow-Level Traffic Classification in Large Networks", journal = j-TKDD, volume = "6", number = "1", pages = "4:1--4:??", month = mar, year = "2012", CODEN = "????", DOI = "http://dx.doi.org/10.1145/2133360.2133364", ISSN = "1556-4681 (print), 1556-472X (electronic)", ISSN-L = "1556-4681", bibdate = "Tue Nov 6 18:30:38 MST 2012", bibsource = "http://www.acm.org/pubs/contents/journals/tkdd/; http://www.math.utah.edu/pub/tex/bib/tkdd.bib", abstract = "The ability to accurately and scalably classify network traffic is of critical importance to a wide range of management tasks of large networks, such as tier-1 ISP networks and global enterprise networks.
Guided by the practical constraints and requirements of traffic classification in large networks, in this article, we explore the design of an accurate and scalable machine learning based flow-level traffic classification system, which is trained on a dataset of flow-level data that has been annotated with application protocol labels by a packet-level classifier. Our system employs a lightweight modular architecture, which combines a series of simple linear binary classifiers, each of which can be efficiently implemented and trained on vast amounts of flow data in parallel, and embraces three key innovative mechanisms, weighted threshold sampling, logistic calibration, and intelligent data partitioning, to achieve scalability while attaining high accuracy. Evaluations using real traffic data from multiple locations in a large ISP show that our system accurately reproduces the labels of the packet level classifier when runs on (unlabeled) flow records, while meeting the scalability and stability requirements of large ISP networks. Using training and test datasets that are two months apart and collected from two different locations, the flow error rates are only 3\% for TCP flows and 0.4\% for UDP flows. We further show that such error rates can be reduced by combining the information of spatial distributions of flows, or collective traffic statistics, during classification. We propose a novel two-step model, which seamlessly integrates these collective traffic statistics into the existing traffic classification system. Experimental results display performance improvement on all traffic classes and an overall error rate reduction by 15\%. 
In addition to a high accuracy, at runtime, our implementation easily scales to classify traffic on 10Gbps links.", acknowledgement = ack-nhfb, articleno = "4", fjournal = "ACM Transactions on Knowledge Discovery from Data (TKDD)", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J1054", } @Article{Mavroeidis:2012:SSF, author = "Dimitrios Mavroeidis and Panagis Magdalinos", title = "A Sequential Sampling Framework for Spectral $k$-Means Based on Efficient Bootstrap Accuracy Estimations: Application to Distributed Clustering", journal = j-TKDD, volume = "6", number = "2", pages = "5:1--5:??", month = jul, year = "2012", CODEN = "????", DOI = "http://dx.doi.org/10.1145/2297456.2297457", ISSN = "1556-4681 (print), 1556-472X (electronic)", ISSN-L = "1556-4681", bibdate = "Tue Nov 6 18:30:38 MST 2012", bibsource = "http://www.acm.org/pubs/contents/journals/tkdd/; http://www.math.utah.edu/pub/tex/bib/tkdd.bib", abstract = "The scalability of learning algorithms has always been a central concern for data mining researchers, and nowadays, with the rapid increase in data storage capacities and availability, its importance has increased. To this end, sampling has been studied by several researchers in an effort to derive sufficiently accurate models using only small data fractions. In this article we focus on spectral $k$-means, that is, the $k$-means approximation as derived by the spectral relaxation, and propose a sequential sampling framework that iteratively enlarges the sample size until the $k$-means results (objective function and cluster structure) become indistinguishable from the asymptotic (infinite-data) output. In the proposed framework we adopt a commonly applied principle in data mining research that considers the use of minimal assumptions concerning the data generating distribution. This restriction imposes several challenges, mainly related to the efficiency of the sequential sampling procedure. 
These challenges are addressed using elements of matrix perturbation theory and statistics. Moreover, although the main focus is on spectral $k$-means, we also demonstrate that the proposed framework can be generalized to handle spectral clustering. The proposed sequential sampling framework is consecutively employed for addressing the distributed clustering problem, where the task is to construct a global model for data that resides in distributed network nodes. The main challenge in this context is related to the bandwidth constraints that are commonly imposed, thus requiring that the distributed clustering algorithm consumes a minimal amount of network load. This illustrates the applicability of the proposed approach, as it enables the determination of a minimal sample size that can be used for constructing an accurate clustering model that entails the distributional characteristics of the data. As opposed to the relevant distributed $k$-means approaches, our framework takes into account the fact that the choice of the number of clusters has a crucial effect on the required amount of communication. More precisely, the proposed algorithm is able to derive a statistical estimation of the required relative sizes for all possible values of $k$. 
This unique feature of our distributed clustering framework enables a network administrator to choose an economic solution that identifies the crude cluster structure of a dataset and not devote excessive network resources for identifying all the ``correct'' detailed clusters.", acknowledgement = ack-nhfb, articleno = "5", fjournal = "ACM Transactions on Knowledge Discovery from Data (TKDD)", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J1054", } @Article{Das:2012:MIG, author = "Sanmay Das and Malik Magdon-Ismail", title = "A Model for Information Growth in Collective Wisdom Processes", journal = j-TKDD, volume = "6", number = "2", pages = "6:1--6:??", month = jul, year = "2012", CODEN = "????", DOI = "http://dx.doi.org/10.1145/2297456.2297458", ISSN = "1556-4681 (print), 1556-472X (electronic)", ISSN-L = "1556-4681", bibdate = "Tue Nov 6 18:30:38 MST 2012", bibsource = "http://www.acm.org/pubs/contents/journals/tkdd/; http://www.math.utah.edu/pub/tex/bib/tkdd.bib", abstract = "Collaborative media such as wikis have become enormously successful venues for information creation. Articles accrue information through the asynchronous editing of users who arrive both seeking information and possibly able to contribute information. Most articles stabilize to high-quality, trusted sources of information representing the collective wisdom of all the users who edited the article. We propose a model for information growth which relies on two main observations: (i) as an article's quality improves, it attracts visitors at a faster rate (a rich-get-richer phenomenon); and, simultaneously, (ii) the chances that a new visitor will improve the article drops (there is only so much that can be said about a particular topic). Our model is able to reproduce many features of the edit dynamics observed on Wikipedia; in particular, it captures the observed rise in the edit rate, followed by $1/ t$ decay. 
Despite differences in the media, we also document similar features in the comment rates for a segment of the LiveJournal blogosphere.", acknowledgement = ack-nhfb, articleno = "6", fjournal = "ACM Transactions on Knowledge Discovery from Data (TKDD)", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J1054", } @Article{Xu:2012:GME, author = "Tianbing Xu and Zhongfei Zhang and Philip S. Yu and Bo Long", title = "Generative Models for Evolutionary Clustering", journal = j-TKDD, volume = "6", number = "2", pages = "7:1--7:??", month = jul, year = "2012", CODEN = "????", DOI = "http://dx.doi.org/10.1145/2297456.2297459", ISSN = "1556-4681 (print), 1556-472X (electronic)", ISSN-L = "1556-4681", bibdate = "Tue Nov 6 18:30:38 MST 2012", bibsource = "http://www.acm.org/pubs/contents/journals/tkdd/; http://www.math.utah.edu/pub/tex/bib/tkdd.bib", abstract = "This article studies evolutionary clustering, a recently emerged hot topic with many important applications, noticeably in dynamic social network analysis. In this article, based on the recent literature on nonparametric Bayesian models, we have developed two generative models: DPChain and HDP-HTM. DPChain is derived from the Dirichlet process mixture (DPM) model, with an exponential decaying component along with the time. HDP-HTM combines the hierarchical Dirichlet process (HDP) with a hierarchical transition matrix (HTM) based on the proposed Infinite hierarchical Markov state model (iHMS). Both models substantially advance the literature on evolutionary clustering, in the sense that not only do they both perform better than those in the existing literature, but more importantly, they are capable of automatically learning the cluster numbers and explicitly addressing the corresponding issues.
Extensive evaluations have demonstrated the effectiveness and the promise of these two solutions compared to the state-of-the-art literature.", acknowledgement = ack-nhfb, articleno = "7", fjournal = "ACM Transactions on Knowledge Discovery from Data (TKDD)", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J1054", } @Article{Wang:2012:LME, author = "Shaojun Wang and Dale Schuurmans and Yunxin Zhao", title = "The Latent Maximum Entropy Principle", journal = j-TKDD, volume = "6", number = "2", pages = "8:1--8:??", month = jul, year = "2012", CODEN = "????", DOI = "http://dx.doi.org/10.1145/2297456.2297460", ISSN = "1556-4681 (print), 1556-472X (electronic)", ISSN-L = "1556-4681", bibdate = "Tue Nov 6 18:30:38 MST 2012", bibsource = "http://www.acm.org/pubs/contents/journals/tkdd/; http://www.math.utah.edu/pub/tex/bib/tkdd.bib", abstract = "We present an extension to Jaynes' maximum entropy principle that incorporates latent variables. The principle of latent maximum entropy we propose is different from both Jaynes' maximum entropy principle and maximum likelihood estimation, but can yield better estimates in the presence of hidden variables and limited training data. We first show that solving for a latent maximum entropy model poses a hard nonlinear constrained optimization problem in general. However, we then show that feasible solutions to this problem can be obtained efficiently for the special case of log-linear models---which forms the basis for an efficient approximation to the latent maximum entropy principle. We derive an algorithm that combines expectation-maximization with iterative scaling to produce feasible log-linear solutions. This algorithm can be interpreted as an alternating minimization algorithm in the information divergence, and reveals an intimate connection between the latent maximum entropy and maximum likelihood principles. 
To select a final model, we generate a series of feasible candidates, calculate the entropy of each, and choose the model that attains the highest entropy. Our experimental results show that estimation based on the latent maximum entropy principle generally gives better results than maximum likelihood when estimating latent variable models on small observed data samples.", acknowledgement = ack-nhfb, articleno = "8", fjournal = "ACM Transactions on Knowledge Discovery from Data (TKDD)", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J1054", } @Article{Bhattacharya:2012:CGC, author = "Indrajit Bhattacharya and Shantanu Godbole and Sachindra Joshi and Ashish Verma", title = "Cross-Guided Clustering: Transfer of Relevant Supervision across Tasks", journal = j-TKDD, volume = "6", number = "2", pages = "9:1--9:??", month = jul, year = "2012", CODEN = "????", DOI = "http://dx.doi.org/10.1145/2297456.2297461", ISSN = "1556-4681 (print), 1556-472X (electronic)", ISSN-L = "1556-4681", bibdate = "Tue Nov 6 18:30:38 MST 2012", bibsource = "http://www.acm.org/pubs/contents/journals/tkdd/; http://www.math.utah.edu/pub/tex/bib/tkdd.bib", abstract = "Lack of supervision in clustering algorithms often leads to clusters that are not useful or interesting to human reviewers. We investigate if supervision can be automatically transferred for clustering a target task, by providing a relevant supervised partitioning of a dataset from a different source task. The target clustering is made more meaningful for the human user by trading-off intrinsic clustering goodness on the target task for alignment with relevant supervised partitions in the source task, wherever possible. We propose a cross-guided clustering algorithm that builds on traditional k-means by aligning the target clusters with source partitions. The alignment process makes use of a cross-task similarity measure that discovers hidden relationships across tasks. 
When the source and target tasks correspond to different domains with potentially different vocabularies, we propose a projection approach using pivot vocabularies for the cross-domain similarity measure. Using multiple real-world and synthetic datasets, we show that our approach improves clustering accuracy significantly over traditional k-means and state-of-the-art semi-supervised clustering baselines, over a wide range of data characteristics and parameter settings.", acknowledgement = ack-nhfb, articleno = "9", fjournal = "ACM Transactions on Knowledge Discovery from Data (TKDD)", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J1054", } @Article{Wang:2012:LBN, author = "Zhenxing Wang and Laiwan Chan", title = "Learning {Bayesian} networks from {Markov} random fields: an efficient algorithm for linear models", journal = j-TKDD, volume = "6", number = "3", pages = "10:1--10:??", month = oct, year = "2012", CODEN = "????", DOI = "http://dx.doi.org/10.1145/2362383.2362384", ISSN = "1556-4681 (print), 1556-472X (electronic)", ISSN-L = "1556-4681", bibdate = "Tue Nov 6 18:30:40 MST 2012", bibsource = "http://www.acm.org/pubs/contents/journals/tkdd/; http://www.math.utah.edu/pub/tex/bib/tkdd.bib", abstract = "Dependency analysis is a typical approach for Bayesian network learning, which infers the structures of Bayesian networks by the results of a series of conditional independence (CI) tests. In practice, testing independence conditioning on large sets hampers the performance of dependency analysis algorithms in terms of accuracy and running time for the following reasons. First, testing independence on large sets of variables with limited samples is not stable. Second, for most dependency analysis algorithms, the number of CI tests grows at an exponential rate with the sizes of conditioning sets, and the running time grows of the same rate. 
Therefore, determining how to reduce the number of CI tests and the sizes of conditioning sets becomes a critical step in dependency analysis algorithms. In this article, we address a two-phase algorithm based on the observation that the structures of Markov random fields are similar to those of Bayesian networks. The first phase of the algorithm constructs a Markov random field from data, which provides a close approximation to the structure of the true Bayesian network; the second phase of the algorithm removes redundant edges according to CI tests to get the true Bayesian network. Both phases use Markov blanket information to reduce the sizes of conditioning sets and the number of CI tests without sacrificing accuracy. An empirical study shows that the two-phase algorithm performs well in terms of accuracy and efficiency.", acknowledgement = ack-nhfb, articleno = "10", fjournal = "ACM Transactions on Knowledge Discovery from Data (TKDD)", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J1054", } @Article{Chan:2012:CID, author = "Jeffrey Chan and James Bailey and Christopher Leckie and Michael Houle", title = "{ciForager}: Incrementally discovering regions of correlated change in evolving graphs", journal = j-TKDD, volume = "6", number = "3", pages = "11:1--11:??", month = oct, year = "2012", CODEN = "????", DOI = "http://dx.doi.org/10.1145/2362383.2362385", ISSN = "1556-4681 (print), 1556-472X (electronic)", ISSN-L = "1556-4681", bibdate = "Tue Nov 6 18:30:40 MST 2012", bibsource = "http://www.acm.org/pubs/contents/journals/tkdd/; http://www.math.utah.edu/pub/tex/bib/tkdd.bib", abstract = "Data mining techniques for understanding how graphs evolve over time have become increasingly important. Evolving graphs arise naturally in diverse applications such as computer network topologies, multiplayer games and medical imaging. A natural and interesting problem in evolving graph analysis is the discovery of compact subgraphs that change in a similar manner. 
Such subgraphs are known as regions of correlated change and they can both summarise change patterns in graphs and help identify the underlying events causing these changes. However, previous techniques for discovering regions of correlated change suffer from limited scalability, making them unsuitable for analysing the evolution of very large graphs. In this paper, we introduce a new algorithm called ciForager, that addresses this scalability challenge and offers considerable improvements. The efficiency of ciForager is based on the use of new incremental techniques for detecting change, as well as the use of Voronoi representations for efficiently determining distance. We experimentally show that ciForager can achieve speedups of up to 1000 times over previous approaches. As a result, it becomes feasible for the first time to discover regions of correlated change in extremely large graphs, such as the entire BGP routing topology of the Internet.", acknowledgement = ack-nhfb, articleno = "11", fjournal = "ACM Transactions on Knowledge Discovery from Data (TKDD)", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J1054", } @Article{Wang:2012:CDS, author = "Dingding Wang and Shenghuo Zhu and Tao Li and Yihong Gong", title = "Comparative document summarization via discriminative sentence selection", journal = j-TKDD, volume = "6", number = "3", pages = "12:1--12:??", month = oct, year = "2012", CODEN = "????", DOI = "http://dx.doi.org/10.1145/2362383.2362386", ISSN = "1556-4681 (print), 1556-472X (electronic)", ISSN-L = "1556-4681", bibdate = "Tue Nov 6 18:30:40 MST 2012", bibsource = "http://www.acm.org/pubs/contents/journals/tkdd/; http://www.math.utah.edu/pub/tex/bib/tkdd.bib", abstract = "Given a collection of document groups, a natural question is to identify the differences among them. 
Although traditional document summarization techniques can summarize the content of the document groups one by one, there exists a great necessity to generate a summary of the differences among the document groups. In this article, we study a novel problem, that of summarizing the differences between document groups. A discriminative sentence selection method is proposed to extract the most discriminative sentences which represent the specific characteristics of each document group. Experiments and case studies on real-world data sets demonstrate the effectiveness of our proposed method.", acknowledgement = ack-nhfb, articleno = "12", fjournal = "ACM Transactions on Knowledge Discovery from Data (TKDD)", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J1054", } @Article{deMelo:2012:FNO, author = "Pedro O. S. {Vaz de Melo} and Virgilio A. F. Almeida and Antonio A. F. Loureiro and Christos Faloutsos", title = "Forecasting in the {NBA} and other team sports: Network effects in action", journal = j-TKDD, volume = "6", number = "3", pages = "13:1--13:??", month = oct, year = "2012", CODEN = "????", DOI = "http://dx.doi.org/10.1145/2362383.2362387", ISSN = "1556-4681 (print), 1556-472X (electronic)", ISSN-L = "1556-4681", bibdate = "Tue Nov 6 18:30:40 MST 2012", bibsource = "http://www.acm.org/pubs/contents/journals/tkdd/; http://www.math.utah.edu/pub/tex/bib/tkdd.bib", abstract = "The multi-million sports-betting market is based on the fact that the task of predicting the outcome of a sports event is very hard. Even with the aid of an uncountable number of descriptive statistics and background information, only a few can correctly guess the outcome of a game or a league. In this work, our approach is to move away from the traditional way of predicting sports events, and instead to model sports leagues as networks of players and teams where the only information available is the work relationships among them. 
We propose two network-based models to predict the behavior of teams in sports leagues. These models are parameter-free, that is, they do not have a single parameter, and moreover are sport-agnostic: they can be applied directly to any team sports league. First, we view a sports league as a network in evolution, and we infer the implicit feedback behind network changes and properties over the years. Then, we use this knowledge to construct the network-based prediction models, which can, with a significantly high probability, indicate how well a team will perform over a season. We compare our proposed models with other prediction models in two of the most popular sports leagues: the National Basketball Association (NBA) and the Major League Baseball (MLB). Our model shows consistently good results in comparison with the other models and, relying upon the network properties of the teams, we achieved a $\approx 14\%$ rank prediction accuracy improvement over our best competitor.", acknowledgement = ack-nhfb, articleno = "13", fjournal = "ACM Transactions on Knowledge Discovery from Data (TKDD)", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J1054", } @Article{Ghosh:2012:SIB, author = "Joydeep Ghosh and Padhraic Smyth and Andrew Tomkins and Rich Caruana", title = "Special issue on best of {SIGKDD 2011}", journal = j-TKDD, volume = "6", number = "4", pages = "14:1--14:??", month = dec, year = "2012", CODEN = "????", DOI = "http://dx.doi.org/10.1145/2382577.2382578", ISSN = "1556-4681 (print), 1556-472X (electronic)", ISSN-L = "1556-4681", bibdate = "Mon Jun 24 13:02:40 MDT 2013", bibsource = "http://www.acm.org/pubs/contents/journals/tkdd/; http://www.math.utah.edu/pub/tex/bib/tkdd.bib", acknowledgement = ack-nhfb, articleno = "14", fjournal = "ACM Transactions on Knowledge Discovery from Data (TKDD)", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J1054", } @Article{Kaufman:2012:LDM, author = "Shachar Kaufman and Saharon Rosset and Claudia Perlich and 
Ori Stitelman", title = "Leakage in data mining: Formulation, detection, and avoidance", journal = j-TKDD, volume = "6", number = "4", pages = "15:1--15:??", month = dec, year = "2012", CODEN = "????", DOI = "http://dx.doi.org/10.1145/2382577.2382579", ISSN = "1556-4681 (print), 1556-472X (electronic)", ISSN-L = "1556-4681", bibdate = "Mon Jun 24 13:02:40 MDT 2013", bibsource = "http://www.acm.org/pubs/contents/journals/tkdd/; http://www.math.utah.edu/pub/tex/bib/tkdd.bib", abstract = "Deemed ``one of the top ten data mining mistakes'', leakage is the introduction of information about the data mining target that should not be legitimately available to mine from. In addition to our own industry experience with real-life projects, controversies around several major public data mining competitions held recently such as the INFORMS 2010 Data Mining Challenge and the IJCNN 2011 Social Network Challenge are evidence that this issue is as relevant today as it has ever been. While acknowledging the importance and prevalence of leakage in both synthetic competitions and real-life data mining projects, existing literature has largely left this idea unexplored. What little has been said turns out not to be broad enough to cover more complex cases of leakage, such as those where the classical independently and identically distributed (i.i.d.) assumption is violated, that have been recently documented. In our new approach, these cases and others are explained by explicitly defining modeling goals and analyzing the broader framework of the data mining problem. The resulting definition enables us to derive general methodology for dealing with the issue. We show that it is possible to avoid leakage with a simple specific approach to data management followed by what we call a learn-predict separation, and present several ways of detecting leakage when the modeler has no control over how the data have been collected. 
We also offer an alternative point of view on leakage that is based on causal graph modeling concepts.", acknowledgement = ack-nhfb, articleno = "15", fjournal = "ACM Transactions on Knowledge Discovery from Data (TKDD)", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J1054", } @Article{Mampaey:2012:SDS, author = "Michael Mampaey and Jilles Vreeken and Nikolaj Tatti", title = "Summarizing data succinctly with the most informative itemsets", journal = j-TKDD, volume = "6", number = "4", pages = "16:1--16:??", month = dec, year = "2012", CODEN = "????", DOI = "http://dx.doi.org/10.1145/2382577.2382580", ISSN = "1556-4681 (print), 1556-472X (electronic)", ISSN-L = "1556-4681", bibdate = "Mon Jun 24 13:02:40 MDT 2013", bibsource = "http://www.acm.org/pubs/contents/journals/tkdd/; http://www.math.utah.edu/pub/tex/bib/tkdd.bib", abstract = "Knowledge discovery from data is an inherently iterative process. That is, what we know about the data greatly determines our expectations, and therefore, what results we would find interesting and/or surprising. Given new knowledge about the data, our expectations will change. Hence, in order to avoid redundant results, knowledge discovery algorithms ideally should follow such an iterative updating procedure. With this in mind, we introduce a well-founded approach for succinctly summarizing data with the most informative itemsets; using a probabilistic maximum entropy model, we iteratively find the itemset that provides us the most novel information---that is, for which the frequency in the data surprises us the most---and in turn we update our model accordingly. As we use the maximum entropy principle to obtain unbiased probabilistic models, and only include those itemsets that are most informative with regard to the current model, the summaries we construct are guaranteed to be both descriptive and nonredundant. 
The algorithm that we present, called mtv, can either discover the top-$k$ most informative itemsets, or we can employ either the Bayesian Information Criterion (bic) or the Minimum Description Length (mdl) principle to automatically identify the set of itemsets that together summarize the data well. In other words, our method will ``tell you what you need to know'' about the data. Importantly, it is a one-phase algorithm: rather than picking itemsets from a user-provided candidate set, itemsets and their supports are mined on-the-fly. To further its applicability, we provide an efficient method to compute the maximum entropy distribution using Quick Inclusion-Exclusion. Experiments on our method, using synthetic, benchmark, and real data, show that the discovered summaries are succinct, and correctly identify the key patterns in the data. The models they form attain high likelihoods, and inspection shows that they summarize the data well with increasingly specific, yet nonredundant itemsets.", acknowledgement = ack-nhfb, articleno = "16", fjournal = "ACM Transactions on Knowledge Discovery from Data (TKDD)", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J1054", } @Article{Chu:2012:TLM, author = "Shumo Chu and James Cheng", title = "Triangle listing in massive networks", journal = j-TKDD, volume = "6", number = "4", pages = "17:1--17:??", month = dec, year = "2012", CODEN = "????", DOI = "http://dx.doi.org/10.1145/2382577.2382581", ISSN = "1556-4681 (print), 1556-472X (electronic)", ISSN-L = "1556-4681", bibdate = "Mon Jun 24 13:02:40 MDT 2013", bibsource = "http://www.acm.org/pubs/contents/journals/tkdd/; http://www.math.utah.edu/pub/tex/bib/tkdd.bib", abstract = "Triangle listing is one of the fundamental algorithmic problems whose solution has numerous applications especially in the analysis of complex networks, such as the computation of clustering coefficients, transitivity, triangular connectivity, trusses, etc. 
Existing algorithms for triangle listing are mainly in-memory algorithms, whose performance cannot scale with the massive volume of today's fast growing networks. When the input graph cannot fit in main memory, triangle listing requires random disk accesses that can incur prohibitively huge I/O cost. Some streaming, semistreaming, and sampling algorithms have been proposed but these are approximation algorithms. We propose an I/O-efficient algorithm for triangle listing. Our algorithm is exact and avoids random disk access. Our results show that our algorithm is scalable and outperforms the state-of-the-art in-memory and local triangle estimation algorithms.", acknowledgement = ack-nhfb, articleno = "17", fjournal = "ACM Transactions on Knowledge Discovery from Data (TKDD)", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J1054", } @Article{Chattopadhyay:2012:MDA, author = "Rita Chattopadhyay and Qian Sun and Wei Fan and Ian Davidson and Sethuraman Panchanathan and Jieping Ye", title = "Multisource domain adaptation and its application to early detection of fatigue", journal = j-TKDD, volume = "6", number = "4", pages = "18:1--18:??", month = dec, year = "2012", CODEN = "????", DOI = "http://dx.doi.org/10.1145/2382577.2382582", ISSN = "1556-4681 (print), 1556-472X (electronic)", ISSN-L = "1556-4681", bibdate = "Mon Jun 24 13:02:40 MDT 2013", bibsource = "http://www.acm.org/pubs/contents/journals/tkdd/; http://www.math.utah.edu/pub/tex/bib/tkdd.bib", abstract = "We consider the characterization of muscle fatigue through a noninvasive sensing mechanism such as Surface ElectroMyoGraphy (SEMG). While changes in the properties of SEMG signals with respect to muscle fatigue have been reported in the literature, the large variation in these signals across different individuals makes the task of modeling and classification of SEMG signals challenging. Indeed, the variation in SEMG parameters from subject to subject creates differences in the data distribution. 
In this article, we propose two transfer learning frameworks based on the multisource domain adaptation methodology for detecting different stages of fatigue using SEMG signals, that addresses the distribution differences. In the proposed frameworks, the SEMG data of a subject represent a domain; data from multiple subjects in the training set form the multiple source domains and the test subject data form the target domain. SEMG signals are predominantly different in conditional probability distribution across subjects. The key feature of the first framework is a novel weighting scheme that addresses the conditional probability distribution differences across multiple domains (subjects) and the key feature of the second framework is a two-stage domain adaptation methodology which combines weighted data from multiple sources based on marginal probability differences (first stage) as well as conditional probability differences (second stage), with the target domain data. The weights for minimizing the marginal probability differences are estimated independently, while the weights for minimizing conditional probability differences are computed simultaneously by exploiting the potential interaction among multiple sources. We also provide a theoretical analysis on the generalization performance of the proposed multisource domain adaptation formulation using the weighted Rademacher complexity measure. We have validated the proposed frameworks on Surface ElectroMyoGram signals collected from 8 people during a fatigue-causing repetitive gripping activity. 
Comprehensive experiments on the SEMG dataset demonstrate that the proposed method improves the classification accuracy by 20\% to 30\% over the cases without any domain adaptation method and by 13\% to 30\% over existing state-of-the-art domain adaptation methods.", acknowledgement = ack-nhfb, articleno = "18", fjournal = "ACM Transactions on Knowledge Discovery from Data (TKDD)", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J1054", } @Article{Wilkinson:2012:SIS, author = "Leland Wilkinson and Anushka Anand and Tuan Nhon Dang", title = "Substantial improvements in the set-covering projection classifier {CHIRP} (composite hypercubes on iterated random projections)", journal = j-TKDD, volume = "6", number = "4", pages = "19:1--19:??", month = dec, year = "2012", CODEN = "????", DOI = "http://dx.doi.org/10.1145/2382577.2382583", ISSN = "1556-4681 (print), 1556-472X (electronic)", ISSN-L = "1556-4681", bibdate = "Mon Jun 24 13:02:40 MDT 2013", bibsource = "http://www.acm.org/pubs/contents/journals/tkdd/; http://www.math.utah.edu/pub/tex/bib/tkdd.bib", abstract = "In Wilkinson et al. [2011] we introduced a new set-covering random projection classifier that achieved average error lower than that of other classifiers in the Weka platform. This classifier was based on an $L^\infty$ norm distance function and exploited an iterative sequence of three stages (projecting, binning, and covering) to deal with the curse of dimensionality, computational complexity, and nonlinear separability. 
We now present substantial changes that improve robustness and reduce training and testing time by almost an order of magnitude without jeopardizing CHIRP's outstanding error performance.", acknowledgement = ack-nhfb, articleno = "19", fjournal = "ACM Transactions on Knowledge Discovery from Data (TKDD)", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J1054", } @Article{Angiulli:2013:NNB, author = "Fabrizio Angiulli and Fabio Fassetti", title = "Nearest Neighbor-Based Classification of Uncertain Data", journal = j-TKDD, volume = "7", number = "1", pages = "1:1--1:??", month = mar, year = "2013", CODEN = "????", DOI = "http://dx.doi.org/10.1145/2435209.2435210", ISSN = "1556-4681 (print), 1556-472X (electronic)", ISSN-L = "1556-4681", bibdate = "Mon Jun 24 13:02:44 MDT 2013", bibsource = "http://www.acm.org/pubs/contents/journals/tkdd/; http://www.math.utah.edu/pub/tex/bib/tkdd.bib", abstract = "This work deals with the problem of classifying uncertain data. With this aim we introduce the Uncertain Nearest Neighbor (UNN) rule, which represents the generalization of the deterministic nearest neighbor rule to the case in which uncertain objects are available. The UNN rule relies on the concept of nearest neighbor class, rather than on that of nearest neighbor object. The nearest neighbor class of a test object is the class that maximizes the probability of providing its nearest neighbor. The evidence is that the former concept is much more powerful than the latter in the presence of uncertainty, in that it correctly models the right semantics of the nearest neighbor decision rule when applied to the uncertain scenario. An effective and efficient algorithm to perform uncertain nearest neighbor classification of a generic (un)certain test object is designed, based on properties that greatly reduce the temporal cost associated with nearest neighbor class probability computation. 
Experimental results are presented, showing that the UNN rule is effective and efficient in classifying uncertain data.", acknowledgement = ack-nhfb, articleno = "1", fjournal = "ACM Transactions on Knowledge Discovery from Data (TKDD)", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J1054", } @Article{Wang:2013:CDS, author = "Dingding Wang and Shenghuo Zhu and Tao Li and Yihong Gong", title = "Comparative Document Summarization via Discriminative Sentence Selection", journal = j-TKDD, volume = "7", number = "1", pages = "2:1--2:??", month = mar, year = "2013", CODEN = "????", DOI = "http://dx.doi.org/10.1145/2435209.2435211", ISSN = "1556-4681 (print), 1556-472X (electronic)", ISSN-L = "1556-4681", bibdate = "Mon Jun 24 13:02:44 MDT 2013", bibsource = "http://www.acm.org/pubs/contents/journals/tkdd/; http://www.math.utah.edu/pub/tex/bib/tkdd.bib", abstract = "Given a collection of document groups, a natural question is to identify the differences among these groups. Although traditional document summarization techniques can summarize the content of the document groups one by one, there exists a great necessity to generate a summary of the differences among the document groups. In this article, we study a novel problem of summarizing the differences between document groups. A discriminative sentence selection method is proposed to extract the most discriminative sentences that represent the specific characteristics of each document group. Experiments and case studies on real-world data sets demonstrate the effectiveness of our proposed method.", acknowledgement = ack-nhfb, articleno = "2", fjournal = "ACM Transactions on Knowledge Discovery from Data (TKDD)", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J1054", } @Article{Bayati:2013:MPA, author = "Mohsen Bayati and David F. 
Gleich and Amin Saberi and Ying Wang", title = "Message-Passing Algorithms for Sparse Network Alignment", journal = j-TKDD, volume = "7", number = "1", pages = "3:1--3:??", month = mar, year = "2013", CODEN = "????", DOI = "http://dx.doi.org/10.1145/2435209.2435212", ISSN = "1556-4681 (print), 1556-472X (electronic)", ISSN-L = "1556-4681", bibdate = "Mon Jun 24 13:02:44 MDT 2013", bibsource = "http://www.acm.org/pubs/contents/journals/tkdd/; http://www.math.utah.edu/pub/tex/bib/tkdd.bib", abstract = "Network alignment generalizes and unifies several approaches for forming a matching or alignment between the vertices of two graphs. We study a mathematical programming framework for network alignment problem and a sparse variation of it where only a small number of matches between the vertices of the two graphs are possible. We propose a new message passing algorithm that allows us to compute, very efficiently, approximate solutions to the sparse network alignment problems with graph sizes as large as hundreds of thousands of vertices. We also provide extensive simulations comparing our algorithms with two of the best solvers for network alignment problems on two synthetic matching problems, two bioinformatics problems, and three large ontology alignment problems including a multilingual problem with a known labeled alignment.", acknowledgement = ack-nhfb, articleno = "3", fjournal = "ACM Transactions on Knowledge Discovery from Data (TKDD)", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J1054", } @Article{Li:2013:CWM, author = "Bin Li and Steven C. H. 
Hoi and Peilin Zhao and Vivekanand Gopalkrishnan", title = "Confidence Weighted Mean Reversion Strategy for Online Portfolio Selection", journal = j-TKDD, volume = "7", number = "1", pages = "4:1--4:??", month = mar, year = "2013", CODEN = "????", DOI = "http://dx.doi.org/10.1145/2435209.2435213", ISSN = "1556-4681 (print), 1556-472X (electronic)", ISSN-L = "1556-4681", bibdate = "Mon Jun 24 13:02:44 MDT 2013", bibsource = "http://www.acm.org/pubs/contents/journals/tkdd/; http://www.math.utah.edu/pub/tex/bib/tkdd.bib", abstract = "Online portfolio selection has been attracting increasing attention from the data mining and machine learning communities. All existing online portfolio selection strategies focus on the first order information of a portfolio vector, though the second order information may also be beneficial to a strategy. Moreover, empirical evidence shows that relative stock prices may follow the mean reversion property, which has not been fully exploited by existing strategies. This article proposes a novel online portfolio selection strategy named Confidence Weighted Mean Reversion (CWMR). Inspired by the mean reversion principle in finance and confidence weighted online learning technique in machine learning, CWMR models the portfolio vector as a Gaussian distribution, and sequentially updates the distribution by following the mean reversion trading principle. CWMR's closed-form updates clearly reflect the mean reversion trading idea. We also present several variants of CWMR algorithms, including a CWMR mixture algorithm that is theoretical universal. Empirically, CWMR strategy is able to effectively exploit the power of mean reversion for online portfolio selection. Extensive experiments on various real markets show that the proposed strategy is superior to the state-of-the-art techniques. 
The experimental testbed including source codes and data sets is available online.", acknowledgement = ack-nhfb, articleno = "4", fjournal = "ACM Transactions on Knowledge Discovery from Data (TKDD)", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J1054", } @Article{Lou:2013:LPR, author = "Tiancheng Lou and Jie Tang and John Hopcroft and Zhanpeng Fang and Xiaowen Ding", title = "Learning to predict reciprocity and triadic closure in social networks", journal = j-TKDD, volume = "7", number = "2", pages = "5:1--5:??", month = jul, year = "2013", CODEN = "????", DOI = "http://dx.doi.org/10.1145/2499907.2499908", ISSN = "1556-4681 (print), 1556-472X (electronic)", ISSN-L = "1556-4681", bibdate = "Thu Mar 13 09:16:06 MDT 2014", bibsource = "http://www.acm.org/pubs/contents/journals/tkdd/; http://www.math.utah.edu/pub/tex/bib/tkdd.bib", abstract = "We study how links are formed in social networks. In particular, we focus on investigating how a reciprocal (two-way) link, the basic relationship in social networks, is developed from a parasocial (one-way) relationship and how the relationships further develop into triadic closure, one of the fundamental processes of link formation. We first investigate how geographic distance and interactions between users influence the formation of link structure among users. Then we study how social theories including homophily, social balance, and social status are satisfied over networks with parasocial and reciprocal relationships. The study unveils several interesting phenomena. For example, ``friend's friend is a friend'' indeed exists in the reciprocal relationship network, but does not hold in the parasocial relationship network. We propose a learning framework to formulate the problems of predicting reciprocity and triadic closure into a graphical model. We demonstrate that it is possible to accurately infer 90\% of reciprocal relationships in a Twitter network. 
The proposed model also achieves better performance (+20--30\% in terms of F1-measure) than several alternative methods for predicting the triadic closure formation.", acknowledgement = ack-nhfb, articleno = "5", fjournal = "ACM Transactions on Knowledge Discovery from Data (TKDD)", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J1054", } @Article{Yang:2013:EOL, author = "Haiqin Yang and Michael R. Lyu and Irwin King", title = "Efficient online learning for multitask feature selection", journal = j-TKDD, volume = "7", number = "2", pages = "6:1--6:??", month = jul, year = "2013", CODEN = "????", DOI = "http://dx.doi.org/10.1145/2499907.2499909", ISSN = "1556-4681 (print), 1556-472X (electronic)", ISSN-L = "1556-4681", bibdate = "Thu Mar 13 09:16:06 MDT 2014", bibsource = "http://www.acm.org/pubs/contents/journals/tkdd/; http://www.math.utah.edu/pub/tex/bib/tkdd.bib", abstract = "Learning explanatory features across multiple related tasks, or MultiTask Feature Selection (MTFS), is an important problem in the applications of data mining, machine learning, and bioinformatics. Previous MTFS methods fulfill this task by batch-mode training. This makes them inefficient when data come sequentially or when the number of training data is so large that they cannot be loaded into the memory simultaneously. In order to tackle these problems, we propose a novel online learning framework to solve the MTFS problem. A main advantage of the online algorithm is its efficiency in both time complexity and memory cost. The weights of the MTFS models at each iteration can be updated by closed-form solutions based on the average of previous subgradients. This yields the worst-case bounds of the time complexity and memory cost at each iteration, both in the order of $O(d \times Q)$, where $d$ is the number of feature dimensions and $Q$ is the number of tasks. 
Moreover, we provide theoretical analysis for the average regret of the online learning algorithms, which also guarantees the convergence rate of the algorithms. Finally, we conduct detailed experiments to show the characteristics and merits of the online learning algorithms in solving several MTFS problems.", acknowledgement = ack-nhfb, articleno = "6", fjournal = "ACM Transactions on Knowledge Discovery from Data (TKDD)", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J1054", } @Article{Zhang:2013:MRL, author = "Yu Zhang and Dit-Yan Yeung", title = "Multilabel relationship learning", journal = j-TKDD, volume = "7", number = "2", pages = "7:1--7:??", month = jul, year = "2013", CODEN = "????", DOI = "http://dx.doi.org/10.1145/2499907.2499910", ISSN = "1556-4681 (print), 1556-472X (electronic)", ISSN-L = "1556-4681", bibdate = "Thu Mar 13 09:16:06 MDT 2014", bibsource = "http://www.acm.org/pubs/contents/journals/tkdd/; http://www.math.utah.edu/pub/tex/bib/tkdd.bib", abstract = "Multilabel learning problems are commonly found in many applications. A characteristic shared by many multilabel learning problems is that some labels have significant correlations between them. In this article, we propose a novel multilabel learning method, called MultiLabel Relationship Learning (MLRL), which extends the conventional support vector machine by explicitly learning and utilizing the relationships between labels. Specifically, we model the label relationships using a label covariance matrix and use it to define a new regularization term for the optimization problem. MLRL learns the model parameters and the label covariance matrix simultaneously based on a unified convex formulation. To solve the convex optimization problem, we use an alternating method in which each subproblem can be solved efficiently. The relationship between MLRL and two widely used maximum margin methods for multilabel learning is investigated. 
Moreover, we also propose a semisupervised extension of MLRL, called SSMLRL, to demonstrate how to make use of unlabeled data to help learn the label covariance matrix. Through experiments conducted on some multilabel applications, we find that MLRL not only gives higher classification accuracy but also has better interpretability as revealed by the label covariance matrix.", acknowledgement = ack-nhfb, articleno = "7", fjournal = "ACM Transactions on Knowledge Discovery from Data (TKDD)", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J1054", } @Article{Peng:2013:EFF, author = "Jing Peng and Guna Seetharaman and Wei Fan and Aparna Varde", title = "Exploiting {Fisher} and {Fukunaga--Koontz} transforms in {Chernoff} dimensionality reduction", journal = j-TKDD, volume = "7", number = "2", pages = "8:1--8:??", month = jul, year = "2013", CODEN = "????", DOI = "http://dx.doi.org/10.1145/2499907.2499911", ISSN = "1556-4681 (print), 1556-472X (electronic)", ISSN-L = "1556-4681", bibdate = "Thu Mar 13 09:16:06 MDT 2014", bibsource = "http://www.acm.org/pubs/contents/journals/tkdd/; http://www.math.utah.edu/pub/tex/bib/tkdd.bib", abstract = "Knowledge discovery from big data demands effective representation of data. However, big data are often characterized by high dimensionality, which makes knowledge discovery more difficult. Many techniques for dimensionality reduction have been proposed, including well-known Fisher's Linear Discriminant Analysis (LDA). However, the Fisher criterion is incapable of dealing with heteroscedasticity in the data. A technique based on the Chernoff criterion for linear dimensionality reduction has been proposed that is capable of exploiting heteroscedastic information in the data. While the Chernoff criterion has been shown to outperform the Fisher's, a clear understanding of its exact behavior is lacking. In this article, we show precisely what can be expected from the Chernoff criterion. 
In particular, we show that the Chernoff criterion exploits the Fisher and Fukunaga--Koontz transforms in computing its linear discriminants. Furthermore, we show that a recently proposed decomposition of the data space into four subspaces is incomplete. We provide arguments on how to best enrich the decomposition of the data space in order to account for heteroscedasticity in the data. Finally, we provide experimental results validating our theoretical analysis.", acknowledgement = ack-nhfb, articleno = "8", fjournal = "ACM Transactions on Knowledge Discovery from Data (TKDD)", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J1054", } @Article{Agarwal:2013:ISI, author = "Deepak Agarwal and Rich Caruana and Jian Pei and Ke Wang", title = "Introduction to the {Special Issue ACM SIGKDD 2012}", journal = j-TKDD, volume = "7", number = "3", pages = "9:1--9:??", month = sep, year = "2013", CODEN = "????", DOI = "http://dx.doi.org/10.1145/2513092.2513093", ISSN = "1556-4681 (print), 1556-472X (electronic)", ISSN-L = "1556-4681", bibdate = "Thu Mar 13 09:16:07 MDT 2014", bibsource = "http://www.acm.org/pubs/contents/journals/tkdd/; http://www.math.utah.edu/pub/tex/bib/tkdd.bib", acknowledgement = ack-nhfb, articleno = "9", fjournal = "ACM Transactions on Knowledge Discovery from Data (TKDD)", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J1054", } @Article{Rakthanmanon:2013:ABD, author = "Thanawin Rakthanmanon and Bilson Campana and Abdullah Mueen and Gustavo Batista and Brandon Westover and Qiang Zhu and Jesin Zakaria and Eamonn Keogh", title = "Addressing Big Data Time Series: Mining Trillions of Time Series Subsequences Under Dynamic Time Warping", journal = j-TKDD, volume = "7", number = "3", pages = "10:1--10:??", month = sep, year = "2013", CODEN = "????", DOI = "http://dx.doi.org/10.1145/2500489", ISSN = "1556-4681 (print), 1556-472X (electronic)", ISSN-L = "1556-4681", bibdate = "Thu Mar 13 09:16:07 MDT 2014", bibsource = 
"http://www.acm.org/pubs/contents/journals/tkdd/; http://www.math.utah.edu/pub/tex/bib/tkdd.bib", abstract = "Most time series data mining algorithms use similarity search as a core subroutine, and thus the time taken for similarity search is the bottleneck for virtually all time series data mining algorithms, including classification, clustering, motif discovery, anomaly detection, and so on. The difficulty of scaling a search to large datasets explains to a great extent why most academic work on time series data mining has plateaued at considering a few millions of time series objects, while much of industry and science sits on billions of time series objects waiting to be explored. In this work we show that by using a combination of four novel ideas we can search and mine massive time series for the first time. We demonstrate the following unintuitive fact: in large datasets we can exactly search under Dynamic Time Warping (DTW) much more quickly than the current state-of-the-art Euclidean distance search algorithms. We demonstrate our work on the largest set of time series experiments ever attempted. In particular, the largest dataset we consider is larger than the combined size of all of the time series datasets considered in all data mining papers ever published. We explain how our ideas allow us to solve higher-level time series data mining problems such as motif discovery and clustering at scales that would otherwise be untenable. Moreover, we show how our ideas allow us to efficiently support the uniform scaling distance measure, a measure whose utility seems to be underappreciated, but which we demonstrate here. 
In addition to mining massive datasets with up to one trillion datapoints, we will show that our ideas also have implications for real-time monitoring of data streams, allowing us to handle much faster arrival rates and/or use cheaper and lower powered devices than are currently possible.", acknowledgement = ack-nhfb, articleno = "10", fjournal = "ACM Transactions on Knowledge Discovery from Data (TKDD)", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J1054", } @Article{Sun:2013:PIM, author = "Yizhou Sun and Brandon Norick and Jiawei Han and Xifeng Yan and Philip S. Yu and Xiao Yu", title = "{PathSelClus}: Integrating Meta-Path Selection with User-Guided Object Clustering in Heterogeneous Information Networks", journal = j-TKDD, volume = "7", number = "3", pages = "11:1--11:??", month = sep, year = "2013", CODEN = "????", DOI = "http://dx.doi.org/10.1145/2500492", ISSN = "1556-4681 (print), 1556-472X (electronic)", ISSN-L = "1556-4681", bibdate = "Thu Mar 13 09:16:07 MDT 2014", bibsource = "http://www.acm.org/pubs/contents/journals/tkdd/; http://www.math.utah.edu/pub/tex/bib/tkdd.bib", abstract = "Real-world, multiple-typed objects are often interconnected, forming heterogeneous information networks. A major challenge for link-based clustering in such networks is their potential to generate many different results, carrying rather diverse semantic meanings. In order to generate desired clustering, we propose to use meta-path, a path that connects object types via a sequence of relations, to control clustering with distinct semantics. Nevertheless, it is easier for a user to provide a few examples (seeds) than a weighted combination of sophisticated meta-paths to specify her clustering preference. Thus, we propose to integrate meta-path selection with user-guided clustering to cluster objects in networks, where a user first provides a small set of object seeds for each cluster as guidance. 
Then the system learns the weight for each meta-path that is consistent with the clustering result implied by the guidance, and generates clusters under the learned weights of meta-paths. A probabilistic approach is proposed to solve the problem, and an effective and efficient iterative algorithm, PathSelClus, is proposed to learn the model, where the clustering quality and the meta-path weights mutually enhance each other. Our experiments with several clustering tasks in two real networks and one synthetic network demonstrate the power of the algorithm in comparison with the baselines.", acknowledgement = ack-nhfb, articleno = "11", fjournal = "ACM Transactions on Knowledge Discovery from Data (TKDD)", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J1054", } @Article{Bellare:2013:ASE, author = "Kedar Bellare and Suresh Iyengar and Aditya Parameswaran and Vibhor Rastogi", title = "Active Sampling for Entity Matching with Guarantees", journal = j-TKDD, volume = "7", number = "3", pages = "12:1--12:??", month = sep, year = "2013", CODEN = "????", DOI = "http://dx.doi.org/10.1145/2500490", ISSN = "1556-4681 (print), 1556-472X (electronic)", ISSN-L = "1556-4681", bibdate = "Thu Mar 13 09:16:07 MDT 2014", bibsource = "http://www.acm.org/pubs/contents/journals/tkdd/; http://www.math.utah.edu/pub/tex/bib/tkdd.bib", abstract = "In entity matching, a fundamental issue while training a classifier to label pairs of entities as either duplicates or nonduplicates is the one of selecting informative training examples. Although active learning presents an attractive solution to this problem, previous approaches minimize the misclassification rate (0--1 loss) of the classifier, which is an unsuitable metric for entity matching due to class imbalance (i.e., many more nonduplicate pairs than duplicate pairs). To address this, a recent paper [Arasu et al. 
2010] proposes to maximize recall of the classifier under the constraint that its precision should be greater than a specified threshold. However, the proposed technique requires the labels of all n input pairs in the worst case. Our main result is an active learning algorithm that approximately maximizes recall of the classifier while respecting a precision constraint with provably sublinear label complexity (under certain distributional assumptions). Our algorithm uses as a black box any active learning module that minimizes 0--1 loss. We show that label complexity of our algorithm is at most log n times the label complexity of the black box, and also bound the difference in the recall of classifier learnt by our algorithm and the recall of the optimal classifier satisfying the precision constraint. We provide an empirical evaluation of our algorithm on several real-world matching data sets that demonstrates the effectiveness of our approach.", acknowledgement = ack-nhfb, articleno = "12", fjournal = "ACM Transactions on Knowledge Discovery from Data (TKDD)", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J1054", } @Article{Chattopadhyay:2013:BMA, author = "Rita Chattopadhyay and Zheng Wang and Wei Fan and Ian Davidson and Sethuraman Panchanathan and Jieping Ye", title = "Batch Mode Active Sampling Based on Marginal Probability Distribution Matching", journal = j-TKDD, volume = "7", number = "3", pages = "13:1--13:??", month = sep, year = "2013", CODEN = "????", DOI = "http://dx.doi.org/10.1145/2513092.2513094", ISSN = "1556-4681 (print), 1556-472X (electronic)", ISSN-L = "1556-4681", bibdate = "Thu Mar 13 09:16:07 MDT 2014", bibsource = "http://www.acm.org/pubs/contents/journals/tkdd/; http://www.math.utah.edu/pub/tex/bib/tkdd.bib", abstract = "Active Learning is a machine learning and data mining technique that selects the most informative samples for labeling and uses them as training data; it is especially useful when there are large amount of 
unlabeled data and labeling them is expensive. Recently, batch-mode active learning, where a set of samples are selected concurrently for labeling, based on their collective merit, has attracted a lot of attention. The objective of batch-mode active learning is to select a set of informative samples so that a classifier learned on these samples has good generalization performance on the unlabeled data. Most of the existing batch-mode active learning methodologies try to achieve this by selecting samples based on certain criteria. In this article we propose a novel criterion which achieves good generalization performance of a classifier by specifically selecting a set of query samples that minimize the difference in distribution between the labeled and the unlabeled data, after annotation. We explicitly measure this difference based on all candidate subsets of the unlabeled data and select the best subset. The proposed objective is an NP-hard integer programming optimization problem. We provide two optimization techniques to solve this problem. In the first one, the problem is transformed into a convex quadratic programming problem and in the second method the problem is transformed into a linear programming problem. Our empirical studies using publicly available UCI datasets and two biomedical image databases demonstrate the effectiveness of the proposed approach in comparison with the state-of-the-art batch-mode active learning methods. We also present two extensions of the proposed approach, which incorporate uncertainty of the predicted labels of the unlabeled data and transfer learning in the proposed formulation. In addition, we present a joint optimization framework for performing both transfer and active learning simultaneously unlike the existing approaches of learning in two separate stages, that is, typically, transfer learning followed by active learning. 
We specifically minimize a common objective of reducing distribution difference between the domain adapted source, the queried and labeled samples and the rest of the unlabeled target domain data. Our empirical studies on two biomedical image databases and on a publicly available 20 Newsgroups dataset show that incorporation of uncertainty information and transfer learning further improves the performance of the proposed active learning based classifier. Our empirical studies also show that the proposed transfer-active method based on the joint optimization framework performs significantly better than a framework which implements transfer and active learning in two separate stages.", acknowledgement = ack-nhfb, articleno = "13", fjournal = "ACM Transactions on Knowledge Discovery from Data (TKDD)", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J1054", } @Article{Briggs:2013:IAM, author = "Forrest Briggs and Xiaoli Z. Fern and Raviv Raich and Qi Lou", title = "Instance Annotation for Multi-Instance Multi-Label Learning", journal = j-TKDD, volume = "7", number = "3", pages = "14:1--14:??", month = sep, year = "2013", CODEN = "????", DOI = "http://dx.doi.org/10.1145/2500491", ISSN = "1556-4681 (print), 1556-472X (electronic)", ISSN-L = "1556-4681", bibdate = "Thu Mar 13 09:16:07 MDT 2014", bibsource = "http://www.acm.org/pubs/contents/journals/tkdd/; http://www.math.utah.edu/pub/tex/bib/tkdd.bib", abstract = "Multi-instance multi-label learning (MIML) is a framework for supervised classification where the objects to be classified are bags of instances associated with multiple labels. For example, an image can be represented as a bag of segments and associated with a list of objects it contains. Prior work on MIML has focused on predicting label sets for previously unseen bags. We instead consider the problem of predicting instance labels while learning from data labeled only at the bag level. 
We propose a regularized rank-loss objective designed for instance annotation, which can be instantiated with different aggregation models connecting instance-level labels with bag-level label sets. The aggregation models that we consider can be factored as a linear function of a ``support instance'' for each class, which is a single feature vector representing a whole bag. Hence we name our proposed methods rank-loss Support Instance Machines (SIM). We propose two optimization methods for the rank-loss objective, which is nonconvex. One is a heuristic method that alternates between updating support instances, and solving a convex problem in which the support instances are treated as constant. The other is to apply the constrained concave-convex procedure (CCCP), which can also be interpreted as iteratively updating support instances and solving a convex problem. To solve the convex problem, we employ the Pegasos framework of primal subgradient descent, and prove that it finds an $ \epsilon $-suboptimal solution in runtime that is linear in the number of bags, instances, and $ 1 / \epsilon $. Additionally, we suggest a method of extending the linear learning algorithm to nonlinear classification, without increasing the runtime asymptotically. 
Experiments on artificial and real-world datasets including images and audio show that the proposed methods achieve higher accuracy than other loss functions used in prior work, e.g., Hamming loss, and recent work in ambiguous label classification.", acknowledgement = ack-nhfb, articleno = "14", fjournal = "ACM Transactions on Knowledge Discovery from Data (TKDD)", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J1054", } @Article{Ji:2013:PFR, author = "Ming Ji and Binbin Lin and Xiaofei He and Deng Cai and Jiawei Han", title = "Parallel Field Ranking", journal = j-TKDD, volume = "7", number = "3", pages = "15:1--15:??", month = sep, year = "2013", CODEN = "????", DOI = "http://dx.doi.org/10.1145/2513092.2513096", ISSN = "1556-4681 (print), 1556-472X (electronic)", ISSN-L = "1556-4681", bibdate = "Thu Mar 13 09:16:07 MDT 2014", bibsource = "http://www.acm.org/pubs/contents/journals/tkdd/; http://www.math.utah.edu/pub/tex/bib/tkdd.bib", abstract = "Recently, ranking data with respect to the intrinsic geometric structure (manifold ranking) has received considerable attentions, with encouraging performance in many applications in pattern recognition, information retrieval and recommendation systems. Most of the existing manifold ranking methods focus on learning a ranking function that varies smoothly along the data manifold. However, beyond smoothness, a desirable ranking function should vary monotonically along the geodesics of the data manifold, such that the ranking order along the geodesics is preserved. In this article, we aim to learn a ranking function that varies linearly and therefore monotonically along the geodesics of the data manifold. Recent theoretical work shows that the gradient field of a linear function on the manifold has to be a parallel vector field. Therefore, we propose a novel ranking algorithm on the data manifolds, called Parallel Field Ranking. Specifically, we try to learn a ranking function and a vector field simultaneously. 
We require the vector field to be close to the gradient field of the ranking function, and the vector field to be as parallel as possible. Moreover, we require the value of the ranking function at the query point to be the highest, and then decrease linearly along the manifold. Experimental results on both synthetic data and real data demonstrate the effectiveness of our proposed algorithm.", acknowledgement = ack-nhfb, articleno = "15", fjournal = "ACM Transactions on Knowledge Discovery from Data (TKDD)", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J1054", } @Article{Adali:2013:IPR, author = "Sibel Adali and Malik Magdon-Ismail and Xiaohui Lu", title = "{iHypR}: Prominence ranking in networks of collaborations with hyperedges", journal = j-TKDD, volume = "7", number = "4", pages = "16:1--16:??", month = nov, year = "2013", CODEN = "????", DOI = "http://dx.doi.org/10.1145/2541268.2541269", ISSN = "1556-4681 (print), 1556-472X (electronic)", ISSN-L = "1556-4681", bibdate = "Thu Mar 13 09:16:09 MDT 2014", bibsource = "http://www.acm.org/pubs/contents/journals/tkdd/; http://www.math.utah.edu/pub/tex/bib/tkdd.bib", abstract = "We present a new algorithm called iHypR for computing prominence of actors in social networks of collaborations. Our algorithm builds on the assumption that prominent actors collaborate on prominent objects, and prominent objects are naturally grouped into prominent clusters or groups (hyperedges in a graph). iHypR makes use of the relationships between actors, objects, and hyperedges to compute a global prominence score for the actors in the network. We do not assume the hyperedges are given in advance. Hyperedges computed by our method can perform as well or even better than ``true'' hyperedges. Our algorithm is customized for networks of collaborations, but it is generally applicable without further tuning.
We show, through extensive experimentation with three real-life data sets and multiple external measures of prominence, that our algorithm outperforms existing well-known algorithms. Our work is the first to offer such an extensive evaluation. We show that unlike most existing algorithms, the performance is robust across multiple measures of performance. Further, we give a detailed study of the sensitivity of our algorithm to different data sets and the design choices within the algorithm that a user may wish to change. Our article illustrates the various trade-offs that must be considered in computing prominence in collaborative social networks.", acknowledgement = ack-nhfb, articleno = "16", fjournal = "ACM Transactions on Knowledge Discovery from Data (TKDD)", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J1054", } @Article{Huang:2013:STP, author = "Jin Huang and Feiping Nie and Heng Huang and Yi-Cheng Tu and Yu Lei", title = "Social trust prediction using heterogeneous networks", journal = j-TKDD, volume = "7", number = "4", pages = "17:1--17:??", month = nov, year = "2013", CODEN = "????", DOI = "http://dx.doi.org/10.1145/2541268.2541270", ISSN = "1556-4681 (print), 1556-472X (electronic)", ISSN-L = "1556-4681", bibdate = "Thu Mar 13 09:16:09 MDT 2014", bibsource = "http://www.acm.org/pubs/contents/journals/tkdd/; http://www.math.utah.edu/pub/tex/bib/tkdd.bib", abstract = "Along with increasing popularity of social websites, online users rely more on the trustworthiness information to make decisions, extract and filter information, and tag and build connections with other users. However, such social network data often suffer from severe data sparsity and are not able to provide users with enough information. Therefore, trust prediction has emerged as an important topic in social network research. Traditional approaches are primarily based on exploring trust graph topology itself. 
However, research in sociology and our life experience suggest that people who are in the same social circle often exhibit similar behaviors and tastes. To take advantage of the ancillary information for trust prediction, the challenge then becomes what to transfer and how to transfer. In this article, we address this problem by aggregating heterogeneous social networks and propose a novel joint social networks mining (JSNM) method. Our new joint learning model explores the user-group-level similarity between correlated graphs and simultaneously learns the individual graph structure; therefore, the shared structures and patterns from multiple social networks can be utilized to enhance the prediction tasks. As a result, we not only improve the trust prediction in the target graph but also facilitate other information retrieval tasks in the auxiliary graphs. To optimize the proposed objective function, we use the alternative technique to break down the objective function into several manageable subproblems. We further introduce the auxiliary function to solve the optimization problems with rigorously proved convergence. The extensive experiments have been conducted on both synthetic and real-world data.
All empirical results demonstrate the effectiveness of our method.", acknowledgement = ack-nhfb, articleno = "17", fjournal = "ACM Transactions on Knowledge Discovery from Data (TKDD)", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J1054", } @Article{Guzzo:2013:SIF, author = "Antonella Guzzo and Luigi Moccia and Domenico Sacc{\`a} and Edoardo Serra", title = "Solving inverse frequent itemset mining with infrequency constraints via large-scale linear programs", journal = j-TKDD, volume = "7", number = "4", pages = "18:1--18:??", month = nov, year = "2013", CODEN = "????", DOI = "http://dx.doi.org/10.1145/2541268.2541271", ISSN = "1556-4681 (print), 1556-472X (electronic)", ISSN-L = "1556-4681", bibdate = "Thu Mar 13 09:16:09 MDT 2014", bibsource = "http://www.acm.org/pubs/contents/journals/tkdd/; http://www.math.utah.edu/pub/tex/bib/tkdd.bib", abstract = "Inverse frequent set mining (IFM) is the problem of computing a transaction database D satisfying given support constraints for some itemsets, which are typically the frequent ones. This article proposes a new formulation of IFM, called IFM$_I$ (IFM with infrequency constraints), where the itemsets that are not listed as frequent are constrained to be infrequent; that is, they must have a support less than or equal to a specified unique threshold. An instance of IFM$_I$ can be seen as an instance of the original IFM by making explicit the infrequency constraints for the minimal infrequent itemsets, corresponding to the so-called negative generator border defined in the literature. The complexity increase from PSPACE (complexity of IFM) to NEXP (complexity of IFM$_I$) is caused by the cardinality of the negative generator border, which can be exponential in the original input size. 
Therefore, the article introduces a specific problem parameter $ \kappa $ that computes an upper bound to this cardinality using a hypergraph interpretation for which minimal infrequent itemsets correspond to minimal transversals. By fixing a constant $k$, the article formulates a $k$-bounded definition of the problem, called $k$-IFM$_I$, that collects all instances for which the value of the parameter $ \kappa $ is less than or equal to $k$---its complexity is in PSPACE as for IFM. The bounded problem is encoded as an integer linear program with a large number of variables (actually exponential w.r.t. the number of constraints), which is thereafter approximated by relaxing integer constraints---the decision problem of solving the linear program is proven to be in NP. In order to solve the linear program, a column generation technique is used that is a variation of the simplex method designed to solve large-scale linear programs, in particular with a huge number of variables. The method at each step requires the solution of an auxiliary integer linear program, which is proven to be NP hard in this case and for which a greedy heuristic is presented. The resulting overall column generation solution algorithm enjoys very good scaling as evidenced by the intensive experimentation, thereby paving the way for its application in real-life scenarios.", acknowledgement = ack-nhfb, articleno = "18", fjournal = "ACM Transactions on Knowledge Discovery from Data (TKDD)", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J1054", } @Article{Balcazar:2013:FCP, author = "Jos{\'e} L.
Balc{\'a}zar", title = "Formal and computational properties of the confidence boost of association rules", journal = j-TKDD, volume = "7", number = "4", pages = "19:1--19:??", month = nov, year = "2013", CODEN = "????", DOI = "http://dx.doi.org/10.1145/2541268.2541272", ISSN = "1556-4681 (print), 1556-472X (electronic)", ISSN-L = "1556-4681", bibdate = "Thu Mar 13 09:16:09 MDT 2014", bibsource = "http://www.acm.org/pubs/contents/journals/tkdd/; http://www.math.utah.edu/pub/tex/bib/tkdd.bib", abstract = "Some existing notions of redundancy among association rules allow for a logical-style characterization and lead to irredundant bases of absolutely minimum size. We push the intuition of redundancy further to find an intuitive notion of novelty of an association rule, with respect to other rules. Namely, an irredundant rule is so because its confidence is higher than what the rest of the rules would suggest; then, one can ask: how much higher? We propose to measure such a sort of novelty through the confidence boost of a rule. Acting as a complement to confidence and support, the confidence boost helps to obtain small and crisp sets of mined association rules and solves the well-known problem that, in certain cases, rules of negative correlation may pass the confidence bound. We analyze the properties of two versions of the notion of confidence boost, one of them a natural generalization of the other. We develop algorithms to filter rules according to their confidence boost, compare the concept to some similar notions in the literature, and describe the results of some experimentation employing the new notions on standard benchmark datasets. 
We describe an open source association mining tool that embodies one of our variants of confidence boost in such a way that the data mining process does not require the user to select any value for any parameter.", acknowledgement = ack-nhfb, articleno = "19", fjournal = "ACM Transactions on Knowledge Discovery from Data (TKDD)", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J1054", } @Article{Ang:2013:CPN, author = "Hock Hee Ang and Vivekanand Gopalkrishnan and Steven C. H. Hoi and Wee Keong Ng", title = "Classification in {P2P} networks with cascade support vector machines", journal = j-TKDD, volume = "7", number = "4", pages = "20:1--20:??", month = nov, year = "2013", CODEN = "????", DOI = "http://dx.doi.org/10.1145/2541268.2541273", ISSN = "1556-4681 (print), 1556-472X (electronic)", ISSN-L = "1556-4681", bibdate = "Thu Mar 13 09:16:09 MDT 2014", bibsource = "http://www.acm.org/pubs/contents/journals/tkdd/; http://www.math.utah.edu/pub/tex/bib/tkdd.bib", abstract = "Classification in Peer-to-Peer (P2P) networks is important to many real applications, such as distributed intrusion detection, distributed recommendation systems, and distributed antispam detection. However, it is very challenging to perform classification in P2P networks due to many practical issues, such as scalability, peer dynamism, and asynchronism. This article investigates the practical techniques of constructing Support Vector Machine (SVM) classifiers in the P2P networks. In particular, we demonstrate how to efficiently cascade SVM in a P2P network with the use of reduced SVM. In addition, we propose to fuse the concept of cascade SVM with bootstrap aggregation to effectively balance the trade-off between classification accuracy, model construction, and prediction cost. We provide theoretical insights for the proposed solutions and conduct an extensive set of empirical studies on a number of large-scale datasets. 
Encouraging results validate the efficacy of the proposed approach.", acknowledgement = ack-nhfb, articleno = "20", fjournal = "ACM Transactions on Knowledge Discovery from Data (TKDD)", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J1054", } @Article{Chen:2014:ISI, author = "Wei Chen and Jie Tang", title = "Introduction to special issue on computational aspects of social and information networks: Theory, methodologies, and applications {(TKDD-CASIN)}", journal = j-TKDD, volume = "8", number = "1", pages = "1:1--1:??", month = feb, year = "2014", CODEN = "????", DOI = "http://dx.doi.org/10.1145/2556608", ISSN = "1556-4681 (print), 1556-472X (electronic)", ISSN-L = "1556-4681", bibdate = "Thu Mar 13 09:16:11 MDT 2014", bibsource = "http://www.acm.org/pubs/contents/journals/tkdd/; http://www.math.utah.edu/pub/tex/bib/tkdd.bib", acknowledgement = ack-nhfb, articleno = "1", fjournal = "ACM Transactions on Knowledge Discovery from Data (TKDD)", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J1054", } @Article{Yang:2014:USN, author = "Zhi Yang and Christo Wilson and Xiao Wang and Tingting Gao and Ben Y. Zhao and Yafei Dai", title = "Uncovering social network {Sybils} in the wild", journal = j-TKDD, volume = "8", number = "1", pages = "2:1--2:??", month = feb, year = "2014", CODEN = "????", DOI = "http://dx.doi.org/10.1145/2556609", ISSN = "1556-4681 (print), 1556-472X (electronic)", ISSN-L = "1556-4681", bibdate = "Thu Mar 13 09:16:11 MDT 2014", bibsource = "http://www.acm.org/pubs/contents/journals/tkdd/; http://www.math.utah.edu/pub/tex/bib/tkdd.bib", abstract = "Sybil accounts are fake identities created to unfairly increase the power or resources of a single malicious user. Researchers have long known about the existence of Sybil accounts in online communities such as file-sharing systems, but they have not been able to perform large-scale measurements to detect them or measure their activities. 
In this article, we describe our efforts to detect, characterize, and understand Sybil account activity in the Renren Online Social Network (OSN). We use ground truth provided by Renren Inc. to build measurement-based Sybil detectors and deploy them on Renren to detect more than 100,000 Sybil accounts. Using our full dataset of 650,000 Sybils, we examine several aspects of Sybil behavior. First, we study their link creation behavior and find that contrary to prior conjecture, Sybils in OSNs do not form tight-knit communities. Next, we examine the fine-grained behaviors of Sybils on Renren using clickstream data. Third, we investigate behind-the-scenes collusion between large groups of Sybils. Our results reveal that Sybils with no explicit social ties still act in concert to launch attacks. Finally, we investigate enhanced techniques to identify stealthy Sybils. In summary, our study advances the understanding of Sybil behavior on OSNs and shows that Sybils can effectively avoid existing community-based Sybil detectors. We hope that our results will foster new research on Sybil detection that is based on novel types of Sybil features.", acknowledgement = ack-nhfb, articleno = "2", fjournal = "ACM Transactions on Knowledge Discovery from Data (TKDD)", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J1054", } @Article{Jin:2014:SAR, author = "Ruoming Jin and Victor E. 
Lee and Longjie Li", title = "Scalable and axiomatic ranking of network role similarity", journal = j-TKDD, volume = "8", number = "1", pages = "3:1--3:??", month = feb, year = "2014", CODEN = "????", DOI = "http://dx.doi.org/10.1145/2518176", ISSN = "1556-4681 (print), 1556-472X (electronic)", ISSN-L = "1556-4681", bibdate = "Thu Mar 13 09:16:11 MDT 2014", bibsource = "http://www.acm.org/pubs/contents/journals/tkdd/; http://www.math.utah.edu/pub/tex/bib/tkdd.bib", abstract = "A key task in analyzing social networks and other complex networks is role analysis: describing and categorizing nodes according to how they interact with other nodes. Two nodes have the same role if they interact with equivalent sets of neighbors. The most fundamental role equivalence is automorphic equivalence. Unfortunately, the fastest algorithms known for graph automorphism are nonpolynomial. Moreover, since exact equivalence is rare, a more meaningful task is measuring the role similarity between any two nodes. This task is closely related to the structural or link-based similarity problem that SimRank addresses. However, SimRank and other existing similarity measures are not sufficient because they do not guarantee to recognize automorphically or structurally equivalent nodes. This article makes two contributions. First, we present and justify several axiomatic properties necessary for a role similarity measure or metric. Second, we present RoleSim, a new similarity metric that satisfies these axioms and can be computed with a simple iterative algorithm. We rigorously prove that RoleSim satisfies all of these axiomatic properties. We also introduce Iceberg RoleSim, a scalable algorithm that discovers all pairs with RoleSim scores above a user-defined threshold $ \theta $. 
We demonstrate the interpretative power of RoleSim on both synthetic and real datasets.", acknowledgement = ack-nhfb, articleno = "3", fjournal = "ACM Transactions on Knowledge Discovery from Data (TKDD)", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J1054", } @Article{Mcauley:2014:DSC, author = "Julian McAuley and Jure Leskovec", title = "Discovering social circles in ego networks", journal = j-TKDD, volume = "8", number = "1", pages = "4:1--4:??", month = feb, year = "2014", CODEN = "????", DOI = "http://dx.doi.org/10.1145/2556612", ISSN = "1556-4681 (print), 1556-472X (electronic)", ISSN-L = "1556-4681", bibdate = "Thu Mar 13 09:16:11 MDT 2014", bibsource = "http://www.acm.org/pubs/contents/journals/tkdd/; http://www.math.utah.edu/pub/tex/bib/tkdd.bib", abstract = "People's personal social networks are big and cluttered, and currently there is no good way to automatically organize them. Social networking sites allow users to manually categorize their friends into social circles (e.g., ``circles'' on Google+, and ``lists'' on Facebook and Twitter). However, circles are laborious to construct and must be manually updated whenever a user's network grows. In this article, we study the novel task of automatically identifying users' social circles. We pose this task as a multimembership node clustering problem on a user's ego network, a network of connections between her friends. We develop a model for detecting circles that combines network structure as well as user profile information. For each circle, we learn its members and the circle-specific user profile similarity metric. Modeling node membership to multiple circles allows us to detect overlapping as well as hierarchically nested circles.
Experiments show that our model accurately identifies circles on a diverse set of data from Facebook, Google+, and Twitter, for all of which we obtain hand-labeled ground truth.", acknowledgement = ack-nhfb, articleno = "4", fjournal = "ACM Transactions on Knowledge Discovery from Data (TKDD)", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J1054", } @Article{Abrahao:2014:SFA, author = "Bruno Abrahao and Sucheta Soundarajan and John Hopcroft and Robert Kleinberg", title = "A separability framework for analyzing community structure", journal = j-TKDD, volume = "8", number = "1", pages = "5:1--5:??", month = feb, year = "2014", CODEN = "????", DOI = "http://dx.doi.org/10.1145/2527231", ISSN = "1556-4681 (print), 1556-472X (electronic)", ISSN-L = "1556-4681", bibdate = "Thu Mar 13 09:16:11 MDT 2014", bibsource = "http://www.acm.org/pubs/contents/journals/tkdd/; http://www.math.utah.edu/pub/tex/bib/tkdd.bib", abstract = "Four major factors govern the intricacies of community extraction in networks: (1) the literature offers a multitude of disparate community detection algorithms whose output exhibits high structural variability across the collection, (2) communities identified by algorithms may differ structurally from real communities that arise in practice, (3) there is no consensus characterizing how to discriminate communities from noncommunities, and (4) the application domain includes a wide variety of networks of fundamentally different natures. In this article, we present a class separability framework to tackle these challenges through a comprehensive analysis of community properties. Our approach enables the assessment of the structural dissimilarity among the output of multiple community detection algorithms and between the output of algorithms and communities that arise in practice. In addition, our method provides us with a way to organize the vast collection of community detection algorithms by grouping those that behave similarly. 
Finally, we identify the most discriminative graph-theoretical properties of community signature and the small subset of properties that account for most of the biases of the different community detection algorithms. We illustrate our approach with an experimental analysis, which reveals nuances of the structure of real and extracted communities. In our experiments, we furnish our framework with the output of 10 different community detection procedures, representative of categories of popular algorithms available in the literature, applied to a diverse collection of large-scale real network datasets whose domains span biology, online shopping, and social systems. We also analyze communities identified by annotations that accompany the data, which reflect exemplar communities in various domains. We characterize these communities using a broad spectrum of community properties to produce the different structural classes. As our experiments show that community structure is not a universal concept, our framework enables an informed choice of the most suitable community detection method for identifying communities of a specific type in a given network and allows for a comparison of existing community detection algorithms while guiding the design of new ones.", acknowledgement = ack-nhfb, articleno = "5", fjournal = "ACM Transactions on Knowledge Discovery from Data (TKDD)", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J1054", } @Article{Zhong:2014:UBL, author = "Erheng Zhong and Wei Fan and Qiang Yang", title = "User behavior learning and transfer in composite social networks", journal = j-TKDD, volume = "8", number = "1", pages = "6:1--6:??", month = feb, year = "2014", CODEN = "????", DOI = "http://dx.doi.org/10.1145/2556613", ISSN = "1556-4681 (print), 1556-472X (electronic)", ISSN-L = "1556-4681", bibdate = "Thu Mar 13 09:16:11 MDT 2014", bibsource = "http://www.acm.org/pubs/contents/journals/tkdd/; http://www.math.utah.edu/pub/tex/bib/tkdd.bib", abstract =
"Accurate prediction of user behaviors is important for many social media applications, including social marketing, personalization, and recommendation. A major challenge lies in that although many previous works model user behavior from only historical behavior logs, the available user behavior data or interactions between users and items in a given social network are usually very limited and sparse (e.g., $ \geq 99.9 \% $ empty), which makes models overfit the rare observations and fail to provide accurate predictions. We observe that many people are members of several social networks in the same time, such as Facebook, Twitter, and Tencent's QQ. Importantly, users' behaviors and interests in different networks influence one another. This provides an opportunity to leverage the knowledge of user behaviors in different networks by considering the overlapping users in different networks as bridges, in order to alleviate the data sparsity problem, and enhance the predictive performance of user behavior modeling. Combining different networks ``simply and naively'' does not work well. In this article, we formulate the problem to model multiple networks as ``adaptive composite transfer'' and propose a framework called ComSoc. ComSoc first selects the most suitable networks inside a composite social network via a hierarchical Bayesian model, parameterized for individual users. It then builds topic models for user behavior prediction using both the relationships in the selected networks and related behavior data. With different relational regularization, we introduce different implementations, corresponding to different ways to transfer knowledge from composite social relations. To handle big data, we have implemented the algorithm using Map/Reduce. 
We demonstrate that the proposed composite network-based user behavior models significantly improve the predictive accuracy over a number of existing approaches on several real-world applications, including a very large social networking dataset from Tencent Inc.", acknowledgement = ack-nhfb, articleno = "6", fjournal = "ACM Transactions on Knowledge Discovery from Data (TKDD)", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J1054", } @Article{Ahmed:2014:NSS, author = "Nesreen K. Ahmed and Jennifer Neville and Ramana Kompella", title = "Network Sampling: From Static to Streaming Graphs", journal = j-TKDD, volume = "8", number = "2", pages = "7:1--7:??", month = jun, year = "2014", CODEN = "????", DOI = "http://dx.doi.org/10.1145/2601438", ISSN = "1556-4681 (print), 1556-472X (electronic)", ISSN-L = "1556-4681", bibdate = "Thu Jun 26 05:48:22 MDT 2014", bibsource = "http://www.acm.org/pubs/contents/journals/tkdd/; http://www.math.utah.edu/pub/tex/bib/tkdd.bib", abstract = "Network sampling is integral to the analysis of social, information, and biological networks. Since many real-world networks are massive in size, continuously evolving, and/or distributed in nature, the network structure is often sampled in order to facilitate study. For these reasons, a more thorough and complete understanding of network sampling is critical to support the field of network science. In this paper, we outline a framework for the general problem of network sampling by highlighting the different objectives, population and units of interest, and classes of network sampling methods. In addition, we propose a spectrum of computational models for network sampling methods, ranging from the traditionally studied model based on the assumption of a static domain to a more challenging model that is appropriate for streaming domains. 
We design a family of sampling methods based on the concept of graph induction that generalize across the full spectrum of computational models (from static to streaming) while efficiently preserving many of the topological properties of the input graphs. Furthermore, we demonstrate how traditional static sampling algorithms can be modified for graph streams for each of the three main classes of sampling methods: node, edge, and topology-based sampling. Experimental results indicate that our proposed family of sampling methods more accurately preserve the underlying properties of the graph in both static and streaming domains. Finally, we study the impact of network sampling algorithms on the parameter estimation and performance evaluation of relational classification algorithms.", acknowledgement = ack-nhfb, articleno = "7", fjournal = "ACM Transactions on Knowledge Discovery from Data (TKDD)", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J1054", } @Article{Ge:2014:RMA, author = "Yong Ge and Guofei Jiang and Min Ding and Hui Xiong", title = "Ranking Metric Anomaly in Invariant Networks", journal = j-TKDD, volume = "8", number = "2", pages = "8:1--8:??", month = jun, year = "2014", CODEN = "????", DOI = "http://dx.doi.org/10.1145/2601436", ISSN = "1556-4681 (print), 1556-472X (electronic)", ISSN-L = "1556-4681", bibdate = "Thu Jun 26 05:48:22 MDT 2014", bibsource = "http://www.acm.org/pubs/contents/journals/tkdd/; http://www.math.utah.edu/pub/tex/bib/tkdd.bib", abstract = "The management of large-scale distributed information systems relies on the effective use and modeling of monitoring data collected at various points in the distributed information systems. A traditional approach to model monitoring data is to discover invariant relationships among the monitoring data. 
Indeed, we can discover all invariant relationships among all pairs of monitoring data and generate invariant networks, where a node is a monitoring data source (metric) and a link indicates an invariant relationship between two monitoring data. Such an invariant network representation can help system experts to localize and diagnose the system faults by examining those broken invariant relationships and their related metrics, since system faults usually propagate among the monitoring data and eventually lead to some broken invariant relationships. However, at one time, there are usually a lot of broken links (invariant relationships) within an invariant network. Without proper guidance, it is difficult for system experts to manually inspect this large number of broken links. To this end, in this article, we propose the problem of ranking metrics according to the anomaly levels for a given invariant network, while this is a nontrivial task due to the uncertainties and the complex nature of invariant networks. Specifically, we propose two types of algorithms for ranking metric anomaly by link analysis in invariant networks. Along this line, we first define two measurements to quantify the anomaly level of each metric, and introduce the mRank algorithm. Also, we provide a weighted score mechanism and develop the gRank algorithm, which involves an iterative process to obtain a score to measure the anomaly levels. In addition, some extended algorithms based on mRank and gRank algorithms are developed by taking into account the probability of being broken as well as noisy links. 
Finally, we validate all the proposed algorithms on a large number of real-world and synthetic data sets to illustrate the effectiveness and efficiency of different algorithms.", acknowledgement = ack-nhfb, articleno = "8", fjournal = "ACM Transactions on Knowledge Discovery from Data (TKDD)", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J1054", } @Article{Zhang:2014:DGP, author = "Gensheng Zhang and Xiao Jiang and Ping Luo and Min Wang and Chengkai Li", title = "Discovering General Prominent Streaks in Sequence Data", journal = j-TKDD, volume = "8", number = "2", pages = "9:1--9:??", month = jun, year = "2014", CODEN = "????", DOI = "http://dx.doi.org/10.1145/2601439", ISSN = "1556-4681 (print), 1556-472X (electronic)", ISSN-L = "1556-4681", bibdate = "Thu Jun 26 05:48:22 MDT 2014", bibsource = "http://www.acm.org/pubs/contents/journals/tkdd/; http://www.math.utah.edu/pub/tex/bib/tkdd.bib", abstract = "This article studies the problem of prominent streak discovery in sequence data. Given a sequence of values, a prominent streak is a long consecutive subsequence consisting of only large (small) values, such as consecutive games of outstanding performance in sports, consecutive hours of heavy network traffic, and consecutive days of frequent mentioning of a person in social media. Prominent streak discovery provides insightful data patterns for data analysis in many real-world applications and is an enabling technique for computational journalism. Given its real-world usefulness and complexity, the research on prominent streaks in sequence data opens a spectrum of challenging problems. A baseline approach to finding prominent streaks is a quadratic algorithm that exhaustively enumerates all possible streaks and performs pairwise streak dominance comparison. For more efficient methods, we make the observation that prominent streaks are in fact skyline points in two dimensions---streak interval length and minimum value in the interval. 
Our solution thus hinges on the idea to separate the two steps in prominent streak discovery: candidate streak generation and skyline operation over candidate streaks. For candidate generation, we propose the concept of local prominent streak (LPS). We prove that prominent streaks are a subset of LPSs and the number of LPSs is less than the length of a data sequence, in comparison with the quadratic number of candidates produced by the brute-force baseline method. We develop efficient algorithms based on the concept of LPS. The nonlinear local prominent streak (NLPS)-based method considers a superset of LPSs as candidates, and the linear local prominent streak (LLPS)-based method further guarantees to consider only LPSs. The proposed properties and algorithms are also extended for discovering general top-$k$, multisequence, and multidimensional prominent streaks. The results of experiments using multiple real datasets verified the effectiveness of the proposed methods and showed orders of magnitude performance improvement against the baseline method.", acknowledgement = ack-nhfb, articleno = "9", fjournal = "ACM Transactions on Knowledge Discovery from Data (TKDD)", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J1054", } @Article{Schifanella:2014:MTD, author = "Claudio Schifanella and K. 
Sel{\c{c}}uk Candan and Maria Luisa Sapino", title = "Multiresolution Tensor Decompositions with Mode Hierarchies", journal = j-TKDD, volume = "8", number = "2", pages = "10:1--10:??", month = jun, year = "2014", CODEN = "????", DOI = "http://dx.doi.org/10.1145/2532169", ISSN = "1556-4681 (print), 1556-472X (electronic)", ISSN-L = "1556-4681", bibdate = "Thu Jun 26 05:48:22 MDT 2014", bibsource = "http://www.acm.org/pubs/contents/journals/tkdd/; http://www.math.utah.edu/pub/tex/bib/tkdd.bib", abstract = "Tensors (multidimensional arrays) are widely used for representing high-order dimensional data, in applications ranging from social networks, sensor data, and Internet traffic. Multiway data analysis techniques, in particular tensor decompositions, allow extraction of hidden correlations among multiway data and thus are key components of many data analysis frameworks. Intuitively, these algorithms can be thought of as multiway clustering schemes, which consider multiple facets of the data in identifying clusters, their weights, and contributions of each data element. Unfortunately, algorithms for fitting multiway models are, in general, iterative and very time consuming. In this article, we observe that, in many applications, there is a priori background knowledge (or metadata) about one or more domain dimensions. This metadata is often in the form of a hierarchy that clusters the elements of a given data facet (or mode). We investigate whether such single-mode data hierarchies can be used to boost the efficiency of tensor decomposition process, without significant impact on the final decomposition quality. We consider each domain hierarchy as a guide to help provide higher- or lower-resolution views of the data in the tensor on demand and we rely on these metadata-induced multiresolution tensor representations to develop a multiresolution approach to tensor decomposition. 
In this article, we focus on an alternating least squares (ALS)--based implementation of the two most important decomposition models such as the PARAllel FACtors (PARAFAC, which decomposes a tensor into a diagonal tensor and a set of factor matrices) and the Tucker (which produces as result a core tensor and a set of dimension-subspaces matrices). Experiment results show that, when the available metadata is used as a rough guide, the proposed multiresolution method helps fit both PARAFAC and Tucker models with consistent (under different parameters settings) savings in execution time and memory consumption, while preserving the quality of the decomposition.", acknowledgement = ack-nhfb, articleno = "10", fjournal = "ACM Transactions on Knowledge Discovery from Data (TKDD)", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J1054", } @Article{Huang:2014:RMN, author = "Jin Huang and Feiping Nie and Heng Huang and Chris Ding", title = "Robust Manifold Nonnegative Matrix Factorization", journal = j-TKDD, volume = "8", number = "3", pages = "11:1--11:??", month = jun, year = "2014", CODEN = "????", DOI = "http://dx.doi.org/10.1145/2601434", ISSN = "1556-4681 (print), 1556-472X (electronic)", ISSN-L = "1556-4681", bibdate = "Tue Jun 3 13:50:26 MDT 2014", bibsource = "http://www.acm.org/pubs/contents/journals/tkdd/; http://www.math.utah.edu/pub/tex/bib/tkdd.bib", abstract = "Nonnegative Matrix Factorization (NMF) has been one of the most widely used clustering techniques for exploratory data analysis. However, since each data point enters the objective function with squared residue error, a few outliers with large errors easily dominate the objective function. In this article, we propose a Robust Manifold Nonnegative Matrix Factorization (RMNMF) method using $ \ell_{2, 1} $-norm and integrating NMF and spectral clustering under the same clustering framework. 
We also point out the solution uniqueness issue for the existing NMF methods and propose an additional orthonormal constraint to address this problem. With the new constraint, the conventional auxiliary function approach no longer works. We tackle this difficult optimization problem via a novel Augmented Lagrangian Method (ALM)--based algorithm and convert the original constrained optimization problem on one variable into a multivariate constrained problem. The new objective function then can be decomposed into several subproblems that each has a closed-form solution. More importantly, we reveal the connection of our method with robust $K$-means and spectral clustering, and we demonstrate its theoretical significance. Extensive experiments have been conducted on nine benchmark datasets, and all empirical results show the effectiveness of our method.", acknowledgement = ack-nhfb, articleno = "11", fjournal = "ACM Transactions on Knowledge Discovery from Data (TKDD)", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J1054", } @Article{Zhang:2014:RAL, author = "Yu Zhang and Dit-Yan Yeung", title = "A Regularization Approach to Learning Task Relationships in Multitask Learning", journal = j-TKDD, volume = "8", number = "3", pages = "12:1--12:??", month = jun, year = "2014", CODEN = "????", DOI = "http://dx.doi.org/10.1145/2538028", ISSN = "1556-4681 (print), 1556-472X (electronic)", ISSN-L = "1556-4681", bibdate = "Tue Jun 3 13:50:26 MDT 2014", bibsource = "http://www.acm.org/pubs/contents/journals/tkdd/; http://www.math.utah.edu/pub/tex/bib/tkdd.bib", abstract = "Multitask learning is a learning paradigm that seeks to improve the generalization performance of a learning task with the help of some other related tasks. In this article, we propose a regularization approach to learning the relationships between tasks in multitask learning. This approach can be viewed as a novel generalization of the regularized formulation for single-task learning. 
Besides modeling positive task correlation, our approach---multitask relationship learning (MTRL)---can also describe negative task correlation and identify outlier tasks based on the same underlying principle. By utilizing a matrix-variate normal distribution as a prior on the model parameters of all tasks, our MTRL method has a jointly convex objective function. For efficiency, we use an alternating method to learn the optimal model parameters for each task as well as the relationships between tasks. We study MTRL in the symmetric multitask learning setting and then generalize it to the asymmetric setting as well. We also discuss some variants of the regularization approach to demonstrate the use of other matrix-variate priors for learning task relationships. Moreover, to gain more insight into our model, we also study the relationships between MTRL and some existing multitask learning methods. Experiments conducted on a toy problem as well as several benchmark datasets demonstrate the effectiveness of MTRL as well as its high interpretability revealed by the task covariance matrix.", acknowledgement = ack-nhfb, articleno = "12", fjournal = "ACM Transactions on Knowledge Discovery from Data (TKDD)", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J1054", } @Article{Lin:2014:SCR, author = "Ming Lin and Shifeng Weng and Changshui Zhang", title = "On the Sample Complexity of Random {Fourier} Features for Online Learning: How Many Random {Fourier} Features Do We Need?", journal = j-TKDD, volume = "8", number = "3", pages = "13:1--13:??", month = jun, year = "2014", CODEN = "????", DOI = "http://dx.doi.org/10.1145/2611378", ISSN = "1556-4681 (print), 1556-472X (electronic)", ISSN-L = "1556-4681", bibdate = "Tue Jun 3 13:50:26 MDT 2014", bibsource = "http://www.acm.org/pubs/contents/journals/tkdd/; http://www.math.utah.edu/pub/tex/bib/tkdd.bib", abstract = "We study the sample complexity of random Fourier features for online kernel learning---that is, the number of 
random Fourier features required to achieve good generalization performance. We show that when the loss function is strongly convex and smooth, online kernel learning with random Fourier features can achieve an $ O (\log T / T) $ bound for the excess risk with only $ O (1 / \lambda^2)$ random Fourier features, where $T$ is the number of training examples and $ \lambda $ is the modulus of strong convexity. This is a significant improvement compared to the existing result for batch kernel learning that requires $ O(T)$ random Fourier features to achieve a generalization bound $ O(1 / \sqrt T)$. Our empirical study verifies that online kernel learning with a limited number of random Fourier features can achieve similar generalization performance as online learning using full kernel matrix. We also present an enhanced online learning algorithm with random Fourier features that improves the classification performance by multiple passes of training examples and a partial average.", acknowledgement = ack-nhfb, articleno = "13", fjournal = "ACM Transactions on Knowledge Discovery from Data (TKDD)", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J1054", } @Article{Eyal:2014:PIM, author = "Ron Eyal and Avi Rosenfeld and Sigal Sina and Sarit Kraus", title = "Predicting and Identifying Missing Node Information in Social Networks", journal = j-TKDD, volume = "8", number = "3", pages = "14:1--14:??", month = jun, year = "2014", CODEN = "????", DOI = "http://dx.doi.org/10.1145/2536775", ISSN = "1556-4681 (print), 1556-472X (electronic)", ISSN-L = "1556-4681", bibdate = "Thu Jun 26 05:48:23 MDT 2014", bibsource = "http://www.acm.org/pubs/contents/journals/tkdd/; http://www.math.utah.edu/pub/tex/bib/tkdd.bib", abstract = "In recent years, social networks have surged in popularity. One key aspect of social network research is identifying important missing information that is not explicitly represented in the network, or is not visible to all. 
To date, this line of research typically focused on finding the connections that are missing between nodes, a challenge typically termed as the link prediction problem. This article introduces the missing node identification problem, where missing members in the social network structure must be identified. In this problem, indications of missing nodes are assumed to exist. Given these indications and a partial network, we must assess which indications originate from the same missing node and determine the full network structure. Toward solving this problem, we present the missing node identification by spectral clustering algorithm (MISC), an approach based on a spectral clustering algorithm, combined with nodes' pairwise affinity measures that were adopted from link prediction research. We evaluate the performance of our approach in different problem settings and scenarios, using real-life data from Facebook. The results show that our approach has beneficial results and can be effective in solving the missing node identification problem. In addition, this article also presents R-MISC, which uses a sparse matrix representation, efficient algorithms for calculating the nodes' pairwise affinity, and a proprietary dimension reduction technique to enable scaling the MISC algorithm to large networks of more than 100,000 nodes. Last, we consider problem settings where some of the indications are unknown. Two algorithms are suggested for this problem: speculative MISC, based on MISC, and missing link completion, based on classical link prediction literature. We show that speculative MISC outperforms missing link completion.", acknowledgement = ack-nhfb, articleno = "14", fjournal = "ACM Transactions on Knowledge Discovery from Data (TKDD)", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J1054", } @Article{Webb:2014:EDM, author = "Geoffrey I. 
Webb and Jilles Vreeken", title = "Efficient Discovery of the Most Interesting Associations", journal = j-TKDD, volume = "8", number = "3", pages = "15:1--15:??", month = jun, year = "2014", CODEN = "????", DOI = "http://dx.doi.org/10.1145/2601433", ISSN = "1556-4681 (print), 1556-472X (electronic)", ISSN-L = "1556-4681", bibdate = "Thu Jun 26 05:48:23 MDT 2014", bibsource = "http://www.acm.org/pubs/contents/journals/tkdd/; http://www.math.utah.edu/pub/tex/bib/tkdd.bib", abstract = "Self-sufficient itemsets have been proposed as an effective approach to summarizing the key associations in data. However, their computation appears highly demanding, as assessing whether an itemset is self-sufficient requires consideration of all pairwise partitions of the itemset into pairs of subsets as well as consideration of all supersets. This article presents the first published algorithm for efficiently discovering self-sufficient itemsets. This branch-and-bound algorithm deploys two powerful pruning mechanisms based on upper bounds on itemset value and statistical significance level. It demonstrates that finding top-$k$ productive and nonredundant itemsets, with postprocessing to identify those that are not independently productive, can efficiently identify small sets of key associations. 
We present extensive evaluation of the strengths and limitations of the technique, including comparisons with alternative approaches to finding the most interesting associations.", acknowledgement = ack-nhfb, articleno = "15", fjournal = "ACM Transactions on Knowledge Discovery from Data (TKDD)", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J1054", } @Article{Shabtai:2014:ODM, author = "Asaf Shabtai and Maya Bercovitch and Lior Rokach and Yuval Elovici", title = "Optimizing Data Misuse Detection", journal = j-TKDD, volume = "8", number = "3", pages = "16:1--16:??", month = jun, year = "2014", CODEN = "????", DOI = "http://dx.doi.org/10.1145/2611520", ISSN = "1556-4681 (print), 1556-472X (electronic)", ISSN-L = "1556-4681", bibdate = "Tue Jun 3 13:50:26 MDT 2014", bibsource = "http://www.acm.org/pubs/contents/journals/tkdd/; http://www.math.utah.edu/pub/tex/bib/tkdd.bib", abstract = "Data misuse may be performed by entities such as an organization's employees and business partners who are granted access to sensitive information and misuse their privileges. We assume that users can be either trusted or untrusted. The access of untrusted parties to data objects (e.g., client and patient records) should be monitored in an attempt to detect misuse. However, monitoring data objects is resource intensive and time-consuming and may also cause disturbance or inconvenience to the involved employees. Therefore, the monitored data objects should be carefully selected. In this article, we present two optimization problems carefully designed for selecting specific data objects for monitoring, such that the detection rate is maximized and the monitoring effort is minimized. In the first optimization problem, the goal is to select data objects for monitoring that are accessed by at most c trusted agents while ensuring access to at least k monitored objects by each untrusted agent (both c and k are integer variable). 
As opposed to the first optimization problem, the goal of the second optimization problem is to select monitored data objects that maximize the number of monitored data objects accessed by untrusted agents while ensuring that each trusted agent does not access more than d monitored data objects (d is an integer variable as well). Two efficient heuristic algorithms for solving these optimization problems are proposed, and experiments were conducted simulating different scenarios to evaluate the algorithms' performance. Moreover, we compared the heuristic algorithms' performance to the optimal solution and conducted sensitivity analysis on the three parameters (c, k, and d) and on the ratio between the trusted and untrusted agents.", acknowledgement = ack-nhfb, articleno = "16", fjournal = "ACM Transactions on Knowledge Discovery from Data (TKDD)", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J1054", } @Article{Hernandez-Orallo:2014:PRC, author = "Jos{\'e} Hern{\'a}ndez-Orallo", title = "Probabilistic Reframing for Cost-Sensitive Regression", journal = j-TKDD, volume = "8", number = "4", pages = "17:1--17:??", month = aug, year = "2014", CODEN = "????", DOI = "http://dx.doi.org/10.1145/2641758", ISSN = "1556-4681 (print), 1556-472X (electronic)", ISSN-L = "1556-4681", bibdate = "Tue Aug 26 17:49:02 MDT 2014", bibsource = "http://www.acm.org/pubs/contents/journals/tkdd/; http://www.math.utah.edu/pub/tex/bib/tkdd.bib", abstract = "Common-day applications of predictive models usually involve the full use of the available contextual information. When the operating context changes, one may fine-tune the by-default (incontextual) prediction or may even abstain from predicting a value (a reject). Global reframing solutions, where the same function is applied to adapt the estimated outputs to a new cost context, are possible solutions here. 
An alternative approach, which has not been studied in a comprehensive way for regression in the knowledge discovery and data mining literature, is the use of a local (e.g., probabilistic) reframing approach, where decisions are made according to the estimated output and a reliability, confidence, or probability estimation. In this article, we advocate for a simple two-parameter (mean and variance) approach, working with a normal conditional probability density. Given the conditional mean produced by any regression technique, we develop lightweight ``enrichment'' methods that produce good estimates of the conditional variance, which are used by the probabilistic (local) reframing methods. We apply these methods to some very common families of cost-sensitive problems, such as optimal predictions in (auction) bids, asymmetric loss scenarios, and rejection rules.", acknowledgement = ack-nhfb, articleno = "17", fjournal = "ACM Transactions on Knowledge Discovery from Data (TKDD)", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J1054", } @Article{Miettinen:2014:MMD, author = "Pauli Miettinen and Jilles Vreeken", title = "{MDL4BMF}: Minimum Description Length for {Boolean} Matrix Factorization", journal = j-TKDD, volume = "8", number = "4", pages = "18:1--18:??", month = oct, year = "2014", CODEN = "????", DOI = "http://dx.doi.org/10.1145/2601437", ISSN = "1556-4681 (print), 1556-472X (electronic)", ISSN-L = "1556-4681", bibdate = "Tue Oct 7 18:45:26 MDT 2014", bibsource = "http://www.acm.org/pubs/contents/journals/tkdd/; http://www.math.utah.edu/pub/tex/bib/tkdd.bib", abstract = "Matrix factorizations---where a given data matrix is approximated by a product of two or more factor matrices---are powerful data mining tools. Among other tasks, matrix factorizations are often used to separate global structure from noise. 
This, however, requires solving the ``model order selection problem'' of determining the proper rank of the factorization, that is, to answer where fine-grained structure stops, and where noise starts. Boolean Matrix Factorization (BMF)---where data, factors, and matrix product are Boolean---has in recent years received increased attention from the data mining community. The technique has desirable properties, such as high interpretability and natural sparsity. Yet, so far no method for selecting the correct model order for BMF has been available. In this article, we propose the use of the Minimum Description Length (MDL) principle for this task. Besides solving the problem, this well-founded approach has numerous benefits; for example, it is automatic, does not require a likelihood function, is fast, and, as experiments show, is highly accurate. We formulate the description length function for BMF in general---making it applicable for any BMF algorithm. We discuss how to construct an appropriate encoding: starting from a simple and intuitive approach, we arrive at a highly efficient data-to-model--based encoding for BMF. 
We extend an existing algorithm for BMF to use MDL to identify the best Boolean matrix factorization, analyze the complexity of the problem, and perform an extensive experimental evaluation to study its behavior.", acknowledgement = ack-nhfb, articleno = "18", fjournal = "ACM Transactions on Knowledge Discovery from Data (TKDD)", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J1054", } @Article{Tang:2014:FSS, author = "Jiliang Tang and Huan Liu", title = "Feature Selection for Social Media Data", journal = j-TKDD, volume = "8", number = "4", pages = "19:1--19:??", month = oct, year = "2014", CODEN = "????", DOI = "http://dx.doi.org/10.1145/2629587", ISSN = "1556-4681 (print), 1556-472X (electronic)", ISSN-L = "1556-4681", bibdate = "Tue Oct 7 18:45:26 MDT 2014", bibsource = "http://www.acm.org/pubs/contents/journals/tkdd/; http://www.math.utah.edu/pub/tex/bib/tkdd.bib", abstract = "Feature selection is widely used in preparing high-dimensional data for effective data mining. The explosive popularity of social media produces massive and high-dimensional data at an unprecedented rate, presenting new challenges to feature selection. Social media data consists of (1) traditional high-dimensional, attribute-value data such as posts, tweets, comments, and images, and (2) linked data that provides social context for posts and describes the relationships between social media users as well as who generates the posts, and so on. The nature of social media also determines that its data is massive, noisy, and incomplete, which exacerbates the already challenging problem of feature selection. In this article, we study a novel feature selection problem of selecting features for social media data with its social context. In detail, we illustrate the differences between attribute-value data and social media data, investigate if linked data can be exploited in a new feature selection framework by taking advantage of social science theories. 
We design and conduct experiments on datasets from real-world social media Web sites, and the empirical results demonstrate that the proposed framework can significantly improve the performance of feature selection. Further experiments are conducted to evaluate the effects of user--user and user--post relationships manifested in linked data on feature selection, and research issues for future work will be discussed.", acknowledgement = ack-nhfb, articleno = "19", fjournal = "ACM Transactions on Knowledge Discovery from Data (TKDD)", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J1054", } @Article{Riondato:2014:EDA, author = "Matteo Riondato and Eli Upfal", title = "Efficient Discovery of Association Rules and Frequent Itemsets through Sampling with Tight Performance Guarantees", journal = j-TKDD, volume = "8", number = "4", pages = "20:1--20:??", month = oct, year = "2014", CODEN = "????", DOI = "http://dx.doi.org/10.1145/2629586", ISSN = "1556-4681 (print), 1556-472X (electronic)", ISSN-L = "1556-4681", bibdate = "Tue Oct 7 18:45:26 MDT 2014", bibsource = "http://www.acm.org/pubs/contents/journals/tkdd/; http://www.math.utah.edu/pub/tex/bib/tkdd.bib", abstract = "The tasks of extracting (top-$K$) Frequent Itemsets (FIs) and Association Rules (ARs) are fundamental primitives in data mining and database applications. Exact algorithms for these problems exist and are widely used, but their running time is hindered by the need of scanning the entire dataset, possibly multiple times. High-quality approximations of FIs and ARs are sufficient for most practical uses. Sampling techniques can be used for fast discovery of approximate solutions, but works exploring this technique did not provide satisfactory performance guarantees on the quality of the approximation due to the difficulty of bounding the probability of under- or oversampling any one of an unknown number of frequent itemsets. 
We circumvent this issue by applying the statistical concept of Vapnik--Chervonenkis (VC) dimension to develop a novel technique for providing tight bounds on the sample size that guarantees approximation of the (top-$K$) FIs and ARs within user-specified parameters. The resulting sample size is linearly dependent on the VC-dimension of a range space associated with the dataset. We analyze the VC-dimension of this range space and show that it is upper bounded by an easy-to-compute characteristic quantity of the dataset, the $d$-index, namely, the maximum integer $d$ such that the dataset contains at least $d$ transactions of length at least $d$ such that no one of them is a superset of or equal to another. We show that this bound is tight for a large class of datasets. The resulting sample size is a significant improvement over previous known results. We present an extensive experimental evaluation of our technique on real and artificial datasets, demonstrating the practicality of our methods, and showing that they achieve even higher quality approximations than what is guaranteed by the analysis.", acknowledgement = ack-nhfb, articleno = "20", fjournal = "ACM Transactions on Knowledge Discovery from Data (TKDD)", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J1054", } @Article{Burton:2014:DSC, author = "Scott H. Burton and Christophe G. 
Giraud-Carrier", title = "Discovering Social Circles in Directed Graphs", journal = j-TKDD, volume = "8", number = "4", pages = "21:1--21:??", month = aug, year = "2014", CODEN = "????", DOI = "http://dx.doi.org/10.1145/2641759", ISSN = "1556-4681 (print), 1556-472X (electronic)", ISSN-L = "1556-4681", bibdate = "Tue Aug 26 17:49:02 MDT 2014", bibsource = "http://www.acm.org/pubs/contents/journals/tkdd/; http://www.math.utah.edu/pub/tex/bib/tkdd.bib", abstract = "We examine the problem of identifying social circles, or sets of cohesive and mutually aware nodes surrounding an initial query set, in directed graphs where the complete graph is not known beforehand. This problem differs from local community mining, in that the query set defines the circle of interest. We explicitly handle edge direction, as in many cases relationships are not symmetric, and focus on the local context because many real-world graphs cannot be feasibly known. We outline several issues that are unique to this context, introduce a quality function to measure the value of including a particular node in an emerging social circle, and describe a greedy social circle discovery algorithm. 
We demonstrate the effectiveness of this approach on artificial benchmarks, large networks with topical community labels, and several real-world case studies.", acknowledgement = ack-nhfb, articleno = "21", fjournal = "ACM Transactions on Knowledge Discovery from Data (TKDD)", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J1054", } @Article{Paul:2014:RPL, author = "Saurabh Paul and Christos Boutsidis and Malik Magdon-Ismail and Petros Drineas", title = "Random Projections for Linear Support Vector Machines", journal = j-TKDD, volume = "8", number = "4", pages = "22:1--22:??", month = oct, year = "2014", CODEN = "????", DOI = "http://dx.doi.org/10.1145/2641760", ISSN = "1556-4681 (print), 1556-472X (electronic)", ISSN-L = "1556-4681", bibdate = "Tue Oct 7 18:45:26 MDT 2014", bibsource = "http://www.acm.org/pubs/contents/journals/tkdd/; http://www.math.utah.edu/pub/tex/bib/tkdd.bib", abstract = "Let $X$ be a data matrix of rank $ \rho $, whose rows represent $n$ points in $d$-dimensional space. The linear support vector machine constructs a hyperplane separator that maximizes the 1-norm soft margin. We develop a new oblivious dimension reduction technique that is precomputed and can be applied to any input matrix $X$. We prove that, with high probability, the margin and minimum enclosing ball in the feature space are preserved to within $ \epsilon $-relative error, ensuring comparable generalization as in the original space in the case of classification. For regression, we show that the margin is preserved to $ \epsilon $-relative error with high probability. 
We present extensive experiments with real and synthetic data to support our theory.", acknowledgement = ack-nhfb, articleno = "22", fjournal = "ACM Transactions on Knowledge Discovery from Data (TKDD)", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J1054", } @Article{Erdo:2014:RGN, author = "D{\'o}ra Erd{\H{o}}s and Rainer Gemulla and Evimaria Terzi", title = "Reconstructing Graphs from Neighborhood Data", journal = j-TKDD, volume = "8", number = "4", pages = "23:1--23:??", month = aug, year = "2014", CODEN = "????", DOI = "http://dx.doi.org/10.1145/2641761", ISSN = "1556-4681 (print), 1556-472X (electronic)", ISSN-L = "1556-4681", bibdate = "Tue Aug 26 17:49:02 MDT 2014", bibsource = "http://www.acm.org/pubs/contents/journals/tkdd/; http://www.math.utah.edu/pub/tex/bib/tkdd.bib", abstract = "Consider a social network and suppose that we are only given the number of common friends between each pair of users. Can we reconstruct the underlying network? Similarly, consider a set of documents and the words that appear in them. If we only know the number of common words for every pair of documents, as well as the number of common documents for every pair of words, can we infer which words appear in which documents? In this article, we develop a general methodology for answering questions like these. We formalize these questions in what we call the {\em R}econstruct problem: given information about the common neighbors of nodes in a network, our goal is to reconstruct the hidden binary matrix that indicates the presence or absence of relationships between individual nodes. In fact, we propose two different variants of this problem: one where the number of connections of every node (i.e., the degree of every node) is known and a second one where it is unknown. We call these variants the degree-aware and the degree-oblivious versions of the Reconstruct problem, respectively. 
Our algorithms for both variants exploit the properties of the singular value decomposition of the hidden binary matrix. More specifically, we show that using the available neighborhood information, we can reconstruct the hidden matrix by finding the components of its singular value decomposition and then combining them appropriately. Our extensive experimental study suggests that our methods are able to reconstruct binary matrices of different characteristics with up to 100\% accuracy.", acknowledgement = ack-nhfb, articleno = "23", fjournal = "ACM Transactions on Knowledge Discovery from Data (TKDD)", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J1054", } @Article{Acharya:2014:OFC, author = "Ayan Acharya and Eduardo R. Hruschka and Joydeep Ghosh and Sreangsu Acharyya", title = "An Optimization Framework for Combining Ensembles of Classifiers and Clusterers with Applications to Nontransductive Semisupervised Learning and Transfer Learning", journal = j-TKDD, volume = "9", number = "1", pages = "1:1--1:??", month = aug, year = "2014", CODEN = "????", DOI = "http://dx.doi.org/10.1145/2601435", ISSN = "1556-4681 (print), 1556-472X (electronic)", ISSN-L = "1556-4681", bibdate = "Tue Aug 26 17:49:05 MDT 2014", bibsource = "http://www.acm.org/pubs/contents/journals/tkdd/; http://www.math.utah.edu/pub/tex/bib/tkdd.bib", abstract = "Unsupervised models can provide supplementary soft constraints to help classify new ``target'' data because similar instances in the target set are more likely to share the same class label. Such models can also help detect possible differences between training and target distributions, which is useful in applications where concept drift may take place, as in transfer learning settings. 
This article describes a general optimization framework that takes as input class membership estimates from existing classifiers learned on previously encountered ``source'' (or training) data, as well as a similarity matrix from a cluster ensemble operating solely on the target (or test) data to be classified, and yields a consensus labeling of the target data. More precisely, the application settings considered are nontransductive semisupervised and transfer learning scenarios where the training data are used only to build an ensemble of classifiers and are subsequently discarded before classifying the target data. The framework admits a wide range of loss functions and classification/clustering methods. It exploits properties of Bregman divergences in conjunction with Legendre duality to yield a principled and scalable approach. A variety of experiments show that the proposed framework can yield results substantially superior to those provided by na{\"\i}vely applying classifiers learned on the original task to the target data. In addition, we show that the proposed approach, even not being conceptually transductive, can provide better results compared to some popular transductive learning techniques.", acknowledgement = ack-nhfb, articleno = "1", fjournal = "ACM Transactions on Knowledge Discovery from Data (TKDD)", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J1054", } @Article{Boedihardjo:2014:FEL, author = "Arnold P. 
Boedihardjo and Chang-Tien Lu and Bingsheng Wang", title = "A Framework for Exploiting Local Information to Enhance Density Estimation of Data Streams", journal = j-TKDD, volume = "9", number = "1", pages = "2:1--2:??", month = aug, year = "2014", CODEN = "????", DOI = "http://dx.doi.org/10.1145/2629618", ISSN = "1556-4681 (print), 1556-472X (electronic)", ISSN-L = "1556-4681", bibdate = "Tue Aug 26 17:49:05 MDT 2014", bibsource = "http://www.acm.org/pubs/contents/journals/tkdd/; http://www.math.utah.edu/pub/tex/bib/tkdd.bib", abstract = "The Probability Density Function (PDF) is the fundamental data model for a variety of stream mining algorithms. Existing works apply the standard nonparametric Kernel Density Estimator (KDE) to approximate the PDF of data streams. As a result, the stream-based KDEs cannot accurately capture complex local density features. In this article, we propose the use of Local Regions (LRs) to model local density information in univariate data streams. In-depth theoretical analyses are presented to justify the effectiveness of the LR-based KDE. Based on the analyses, we develop the General Local rEgion AlgorithM (GLEAM) to enhance the estimation quality of structurally complex univariate distributions for existing stream-based KDEs. A set of algorithmic optimizations is designed to improve the query throughput of GLEAM and to achieve its linear order computation. 
Additionally, a comprehensive suite of experiments was conducted to test the effectiveness and efficiency of GLEAM.", acknowledgement = ack-nhfb, articleno = "2", fjournal = "ACM Transactions on Knowledge Discovery from Data (TKDD)", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J1054", } @Article{Ordonez:2014:BVS, author = "Carlos Ordonez and Carlos Garcia-Alvarado and Veerabhadaran Baladandayuthapani", title = "{Bayesian} Variable Selection in Linear Regression in One Pass for Large Datasets", journal = j-TKDD, volume = "9", number = "1", pages = "3:1--3:??", month = aug, year = "2014", CODEN = "????", DOI = "http://dx.doi.org/10.1145/2629617", ISSN = "1556-4681 (print), 1556-472X (electronic)", ISSN-L = "1556-4681", bibdate = "Tue Aug 26 17:49:05 MDT 2014", bibsource = "http://www.acm.org/pubs/contents/journals/tkdd/; http://www.math.utah.edu/pub/tex/bib/tkdd.bib", abstract = "Bayesian models are generally computed with Markov Chain Monte Carlo (MCMC) methods. The main disadvantage of MCMC methods is the large number of iterations they need to sample the posterior distributions of model parameters, especially for large datasets. On the other hand, variable selection remains a challenging problem due to its combinatorial search space, where Bayesian models are a promising solution. In this work, we study how to accelerate Bayesian model computation for variable selection in linear regression. We propose a fast Gibbs sampler algorithm, a widely used MCMC method that incorporates several optimizations. We use a Zellner prior for the regression coefficients, an improper prior on variance, and a conjugate prior Gaussian distribution, which enable dataset summarization in one pass, thus exploiting an augmented set of sufficient statistics. Thereafter, the algorithm iterates in main memory. Sufficient statistics are indexed with a sparse binary vector to efficiently compute matrix projections based on selected variables. 
Discovered variable subsets probabilities, selecting and discarding each variable, are stored on a hash table for fast retrieval in future iterations. We study how to integrate our algorithm into a Database Management System (DBMS), exploiting aggregate User-Defined Functions for parallel data summarization and stored procedures to manipulate matrices with arrays. An experimental evaluation with real datasets evaluates accuracy and time performance, comparing our DBMS-based algorithm with the R package. Our algorithm is shown to produce accurate results, scale linearly on dataset size, and run orders of magnitude faster than the R package.", acknowledgement = ack-nhfb, articleno = "3", fjournal = "ACM Transactions on Knowledge Discovery from Data (TKDD)", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J1054", } @Article{Fei:2014:SSB, author = "Hongliang Fei and Jun Huan", title = "Structured Sparse Boosting for Graph Classification", journal = j-TKDD, volume = "9", number = "1", pages = "4:1--4:??", month = aug, year = "2014", CODEN = "????", DOI = "http://dx.doi.org/10.1145/2629328", ISSN = "1556-4681 (print), 1556-472X (electronic)", ISSN-L = "1556-4681", bibdate = "Tue Aug 26 17:49:05 MDT 2014", bibsource = "http://www.acm.org/pubs/contents/journals/tkdd/; http://www.math.utah.edu/pub/tex/bib/tkdd.bib", abstract = "Boosting is a highly effective algorithm that produces a linear combination of weak classifiers (a.k.a. base learners) to obtain high-quality classification models. In this article, we propose a generalized logit boost algorithm in which base learners have structural relationships in the functional space. Although such relationships are generic, our work is particularly motivated by the emerging topic of pattern-based classification for semistructured data including graphs. 
Toward an efficient incorporation of the structure information, we have designed a general model in which we use an undirected graph to capture the relationship of subgraph-based base learners. In our method, we employ both L$_1$ and Laplacian-based L$_2$ regularization to logit boosting to achieve model sparsity and smoothness in the functional space spanned by the base learners. We have derived efficient optimization algorithms based on coordinate descent for the new boosting formulation and theoretically prove that it exhibits a natural grouping effect for nearby spatial or overlapping base learners and that the resulting estimator is consistent. Additionally, motivated by the connection between logit boosting and logistic regression, we extend our structured sparse regularization framework to logistic regression for vectorial data in which features are structured. Using comprehensive experimental study and comparing our work with the state-of-the-art, we have demonstrated the effectiveness of the proposed learning method.", acknowledgement = ack-nhfb, articleno = "4", fjournal = "ACM Transactions on Knowledge Discovery from Data (TKDD)", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J1054", } @Article{Xu:2014:GGB, author = "Zhiqiang Xu and Yiping Ke and Yi Wang and Hong Cheng and James Cheng", title = "{GBAGC}: a General {Bayesian} Framework for Attributed Graph Clustering", journal = j-TKDD, volume = "9", number = "1", pages = "5:1--5:??", month = aug, year = "2014", CODEN = "????", DOI = "http://dx.doi.org/10.1145/2629616", ISSN = "1556-4681 (print), 1556-472X (electronic)", ISSN-L = "1556-4681", bibdate = "Tue Aug 26 17:49:05 MDT 2014", bibsource = "http://www.acm.org/pubs/contents/journals/tkdd/; http://www.math.utah.edu/pub/tex/bib/tkdd.bib", abstract = "Graph clustering, also known as community detection, is a long-standing problem in data mining. 
In recent years, with the proliferation of rich attribute information available for objects in real-world graphs, how to leverage not only structural but also attribute information for clustering attributed graphs becomes a new challenge. Most existing works took a distance-based approach. They proposed various distance measures to fuse structural and attribute information and then applied standard techniques for graph clustering based on these distance measures. In this article, we take an alternative view and propose a novel Bayesian framework for attributed graph clustering. Our framework provides a general and principled solution to modeling both the structural and the attribute aspects of a graph. It avoids the artificial design of a distance measure in existing methods and, furthermore, can seamlessly handle graphs with different types of edges and vertex attributes. We develop an efficient variational method for graph clustering under this framework and derive two concrete algorithms for clustering unweighted and weighted attributed graphs. 
Experimental results on large real-world datasets show that our algorithms significantly outperform the state-of-the-art distance-based method, in terms of both effectiveness and efficiency.", acknowledgement = ack-nhfb, articleno = "5", fjournal = "ACM Transactions on Knowledge Discovery from Data (TKDD)", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J1054", } @Article{Coscia:2014:UHO, author = "Michele Coscia and Giulio Rossetti and Fosca Giannotti and Dino Pedreschi", title = "Uncovering Hierarchical and Overlapping Communities with a Local-First Approach", journal = j-TKDD, volume = "9", number = "1", pages = "6:1--6:??", month = aug, year = "2014", CODEN = "????", DOI = "http://dx.doi.org/10.1145/2629511", ISSN = "1556-4681 (print), 1556-472X (electronic)", ISSN-L = "1556-4681", bibdate = "Tue Aug 26 17:49:05 MDT 2014", bibsource = "http://www.acm.org/pubs/contents/journals/tkdd/; http://www.math.utah.edu/pub/tex/bib/tkdd.bib", abstract = "Community discovery in complex networks is the task of organizing a network's structure by grouping together nodes related to each other. Traditional approaches are based on the assumption that there is a global-level organization in the network. However, in many scenarios, each node is the bearer of complex information and cannot be classified in disjoint clusters. The top-down global view of the partition approach is not designed for this. Here, we represent this complex information as multiple latent labels, and we postulate that edges in the networks are created among nodes carrying similar labels. The latent labels are the communities a node belongs to and we discover them with a simple local-first approach to community discovery. 
This is achieved by democratically letting each node vote for the communities it sees surrounding it in its limited view of the global system, its ego neighborhood, using a label propagation algorithm, assuming that each node is aware of the label it shares with each of its connections. The local communities are merged hierarchically, unveiling the modular organization of the network at the global level and identifying overlapping groups and groups of groups. We tested this intuition against the state-of-the-art overlapping community discovery and found that our new method advances in the chosen scenarios in the quality of the obtained communities. We perform a test on benchmark and on real-world networks, evaluating the quality of the community coverage by using the extracted communities to predict the metadata attached to the nodes, which we consider external information about the latent labels. We also provide an explanation about why real-world networks contain overlapping communities and how our logic is able to capture them. 
Finally, we show how our method is deterministic, is incremental, and has a limited time complexity, so that it can be used on real-world scale networks.", acknowledgement = ack-nhfb, articleno = "6", fjournal = "ACM Transactions on Knowledge Discovery from Data (TKDD)", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J1054", } @Article{Wang:2014:GML, author = "Guangtao Wang and Qinbao Song and Xueying Zhang and Kaiyuan Zhang", title = "A Generic Multilabel Learning-Based Classification Algorithm Recommendation Method", journal = j-TKDD, volume = "9", number = "1", pages = "7:1--7:??", month = oct, year = "2014", CODEN = "????", DOI = "http://dx.doi.org/10.1145/2629474", ISSN = "1556-4681 (print), 1556-472X (electronic)", ISSN-L = "1556-4681", bibdate = "Fri Oct 10 17:19:10 MDT 2014", bibsource = "http://www.acm.org/pubs/contents/journals/tkdd/; http://www.math.utah.edu/pub/tex/bib/tkdd.bib", abstract = "As more and more classification algorithms continue to be developed, recommending appropriate algorithms to a given classification problem is increasingly important. This article first distinguishes the algorithm recommendation methods by two dimensions: (1) meta-features, which are a set of measures used to characterize the learning problems, and (2) meta-target, which represents the relative performance of the classification algorithms on the learning problem. In contrast to the existing algorithm recommendation methods whose meta-target is usually in the form of either the ranking of candidate algorithms or a single algorithm, this article proposes a new and natural multilabel form to describe the meta-target. This is due to the fact that there would be multiple algorithms being appropriate for a given problem in practice. 
Furthermore, a novel multilabel learning-based generic algorithm recommendation method is proposed, which views the algorithm recommendation as a multilabel learning problem and solves the problem by the mature multilabel learning algorithms. To evaluate the proposed multilabel learning-based recommendation method, extensive experiments with 13 well-known classification algorithms, two kinds of meta-targets such as algorithm ranking and single algorithm, and five different kinds of meta-features are conducted on 1,090 benchmark learning problems. The results show the effectiveness of our proposed multilabel learning-based recommendation method.", acknowledgement = ack-nhfb, articleno = "7", fjournal = "ACM Transactions on Knowledge Discovery from Data (TKDD)", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J1054", } @Article{Wang:2014:EEM, author = "Pinghui Wang and John C. S. Lui and Bruno Ribeiro and Don Towsley and Junzhou Zhao and Xiaohong Guan", title = "Efficiently Estimating Motif Statistics of Large Networks", journal = j-TKDD, volume = "9", number = "2", pages = "8:1--8:??", month = sep, year = "2014", CODEN = "????", DOI = "http://dx.doi.org/10.1145/2629564", ISSN = "1556-4681 (print), 1556-472X (electronic)", ISSN-L = "1556-4681", bibdate = "Tue Oct 7 18:49:26 MDT 2014", bibsource = "http://www.acm.org/pubs/contents/journals/tkdd/; http://www.math.utah.edu/pub/tex/bib/tkdd.bib", abstract = "Exploring statistics of locally connected subgraph patterns (also known as network motifs) has helped researchers better understand the structure and function of biological and Online Social Networks (OSNs). Nowadays, the massive size of some critical networks---often stored in already overloaded relational databases---effectively limits the rate at which nodes and edges can be explored, making it a challenge to accurately discover subgraph statistics. 
In this work, we propose sampling methods to accurately estimate subgraph statistics from as few queried nodes as possible. We present sampling algorithms that efficiently and accurately estimate subgraph properties of massive networks. Our algorithms require no precomputation or complete network topology information. At the same time, we provide theoretical guarantees of convergence. We perform experiments using widely known datasets and show that, for the same accuracy, our algorithms require an order of magnitude less queries (samples) than the current state-of-the-art algorithms.", acknowledgement = ack-nhfb, articleno = "8", fjournal = "ACM Transactions on Knowledge Discovery from Data (TKDD)", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J1054", } @Article{Zheng:2014:FHE, author = "Li Zheng and Tao Li and Chris Ding", title = "A Framework for Hierarchical Ensemble Clustering", journal = j-TKDD, volume = "9", number = "2", pages = "9:1--9:??", month = sep, year = "2014", CODEN = "????", DOI = "http://dx.doi.org/10.1145/2611380", ISSN = "1556-4681 (print), 1556-472X (electronic)", ISSN-L = "1556-4681", bibdate = "Tue Oct 7 18:49:26 MDT 2014", bibsource = "http://www.acm.org/pubs/contents/journals/tkdd/; http://www.math.utah.edu/pub/tex/bib/tkdd.bib", abstract = "Ensemble clustering, as an important extension of the clustering problem, refers to the problem of combining different (input) clusterings of a given dataset to generate a final (consensus) clustering that is a better fit in some sense than existing clusterings. Over the past few years, many ensemble clustering approaches have been developed. However, most of them are designed for partitional clustering methods, and few research efforts have been reported for ensemble hierarchical clustering methods. In this article, a hierarchical ensemble clustering framework that can naturally combine both partitional clustering and hierarchical clustering results is proposed. 
In addition, a novel method for learning the ultra-metric distance from the aggregated distance matrices and generating final hierarchical clustering with enhanced cluster separation is developed based on the ultra-metric distance for hierarchical clustering. We study three important problems: dendrogram description, dendrogram combination, and dendrogram selection. We develop two approaches for dendrogram selection based on tree distances, and we investigate various dendrogram distances for representing dendrograms. We provide a systematic empirical study of the ensemble hierarchical clustering problem. Experimental results demonstrate the effectiveness of our proposed approaches.", acknowledgement = ack-nhfb, articleno = "9", fjournal = "ACM Transactions on Knowledge Discovery from Data (TKDD)", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J1054", } @Article{Huai:2014:TPC, author = "Baoxing Huai and Enhong Chen and Hengshu Zhu and Hui Xiong and Tengfei Bao and Qi Liu and Jilei Tian", title = "Toward Personalized Context Recognition for Mobile Users: a Semisupervised {Bayesian} {HMM} Approach", journal = j-TKDD, volume = "9", number = "2", pages = "10:1--10:??", month = sep, year = "2014", CODEN = "????", DOI = "http://dx.doi.org/10.1145/2629504", ISSN = "1556-4681 (print), 1556-472X (electronic)", ISSN-L = "1556-4681", bibdate = "Tue Oct 7 18:49:26 MDT 2014", bibsource = "http://www.acm.org/pubs/contents/journals/tkdd/; http://www.math.utah.edu/pub/tex/bib/tkdd.bib", abstract = "The problem of mobile context recognition targets the identification of semantic meaning of context in a mobile environment. This plays an important role in understanding mobile user behaviors and thus provides the opportunity for the development of better intelligent context-aware services. A key step of context recognition is to model the personalized contextual information of mobile users. 
Although many studies have been devoted to mobile context modeling, limited efforts have been made on the exploitation of the sequential and dependency characteristics of mobile contextual information. Also, the latent semantics behind mobile context are often ambiguous and poorly understood. Indeed, a promising direction is to incorporate some domain knowledge of common contexts, such as ``waiting for a bus'' or ``having dinner,'' by modeling both labeled and unlabeled context data from mobile users because there are often few labeled contexts available in practice. To this end, in this article, we propose a sequence-based semisupervised approach to modeling personalized context for mobile users. Specifically, we first exploit the Bayesian Hidden Markov Model (B-HMM) for modeling context in the form of probabilistic distributions and transitions of raw context data. Also, we propose a sequential model by extending B-HMM with the prior knowledge of contextual features to model context more accurately. Then, to efficiently learn the parameters and initial values of the proposed models, we develop a novel approach for parameter estimation by integrating the Dirichlet Process Mixture (DPM) model and the Mixture Unigram (MU) model. Furthermore, by incorporating both user-labeled and unlabeled data, we propose a semisupervised learning-based algorithm to identify and model the latent semantics of context. Finally, experimental results on real-world data clearly validate both the efficiency and effectiveness of the proposed approaches for recognizing personalized context of mobile users.", acknowledgement = ack-nhfb, articleno = "10", fjournal = "ACM Transactions on Knowledge Discovery from Data (TKDD)", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J1054", } @Article{Liu:2014:ADI, author = "Siyuan Liu and Lei Chen and Lionel M. 
Ni", title = "Anomaly Detection from Incomplete Data", journal = j-TKDD, volume = "9", number = "2", pages = "11:1--11:??", month = sep, year = "2014", CODEN = "????", DOI = "http://dx.doi.org/10.1145/2629668", ISSN = "1556-4681 (print), 1556-472X (electronic)", ISSN-L = "1556-4681", bibdate = "Tue Oct 7 18:49:26 MDT 2014", bibsource = "http://www.acm.org/pubs/contents/journals/tkdd/; http://www.math.utah.edu/pub/tex/bib/tkdd.bib", abstract = "Anomaly detection (a.k.a., outlier or burst detection) is a well-motivated problem and a major data mining and knowledge discovery task. In this article, we study the problem of population anomaly detection, one of the key issues related to event monitoring and population management within a city. Through studying detected population anomalies, we can trace and analyze these anomalies, which could help to model city traffic design and event impact analysis and prediction. Although a significant and interesting issue, it is very hard to detect population anomalies and retrieve anomaly trajectories, especially given that it is difficult to get actual and sufficient population data. To address the difficulties of a lack of real population data, we take advantage of mobile phone networks, which offer enormous spatial and temporal communication data on persons. More importantly, we claim that we can utilize these mobile phone data to infer and approximate population data. Thus, we can study the population anomaly detection problem by taking advantages of unique features hidden in mobile phone data. In this article, we present a system to conduct Population Anomaly Detection (PAD). First, we propose an effective clustering method, correlation-based clustering, to cluster the incomplete location information from mobile phone data (i.e., from mobile call volume distribution to population density distribution). Then, we design an adaptive parameter-free detection method, R-scan, to capture the distributed dynamic anomalies. 
Finally, we devise an efficient algorithm, BT-miner, to retrieve anomaly trajectories. The experimental results from real-life mobile phone data confirm the effectiveness and efficiency of the proposed algorithms. Finally, the proposed methods are realized as a pilot system in a city in China.", acknowledgement = ack-nhfb, articleno = "11", fjournal = "ACM Transactions on Knowledge Discovery from Data (TKDD)", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J1054", } @Article{Gundecha:2014:UVR, author = "Pritam Gundecha and Geoffrey Barbier and Jiliang Tang and Huan Liu", title = "User Vulnerability and Its Reduction on a Social Networking Site", journal = j-TKDD, volume = "9", number = "2", pages = "12:1--12:??", month = sep, year = "2014", CODEN = "????", DOI = "http://dx.doi.org/10.1145/2630421", ISSN = "1556-4681 (print), 1556-472X (electronic)", ISSN-L = "1556-4681", bibdate = "Tue Oct 7 18:49:26 MDT 2014", bibsource = "http://www.acm.org/pubs/contents/journals/tkdd/; http://www.math.utah.edu/pub/tex/bib/tkdd.bib", abstract = "Privacy and security are major concerns for many users of social media. When users share information (e.g., data and photos) with friends, they can make their friends vulnerable to security and privacy breaches with dire consequences. With the continuous expansion of a user's social network, privacy settings alone are often inadequate to protect a user's profile. In this research, we aim to address some critical issues related to privacy protection: (1) How can we measure and assess individual users' vulnerability? (2) With the diversity of one's social network friends, how can one figure out an effective approach to maintaining balance between vulnerability and social utility? In this work, first we present a novel way to define vulnerable friends from an individual user's perspective. 
User vulnerability is dependent on whether or not the user's friends' privacy settings protect the friend and the individual's network of friends (which includes the user). We show that it is feasible to measure and assess user vulnerability and reduce one's vulnerability without changing the structure of a social networking site. The approach is to unfriend one's most vulnerable friends. However, when such a vulnerable friend is also socially important, unfriending him or her would significantly reduce one's own social status. We formulate this novel problem as vulnerability minimization with social utility constraints. We formally define the optimization problem and provide an approximation algorithm with a proven bound. Finally, we conduct a large-scale evaluation of a new framework using a Facebook dataset. We resort to experiments and observe how much vulnerability an individual user can be decreased by unfriending a vulnerable friend. We compare performance of different unfriending strategies and discuss the security risk of new friend requests. Additionally, by employing different forms of social utility, we confirm that the balance between user vulnerability and social utility can be practically achieved.", acknowledgement = ack-nhfb, articleno = "12", fjournal = "ACM Transactions on Knowledge Discovery from Data (TKDD)", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J1054", } @Article{Duan:2014:SRC, author = "Lian Duan and W. 
Nick Street and Yanchi Liu and Songhua Xu and Brook Wu", title = "Selecting the Right Correlation Measure for Binary Data", journal = j-TKDD, volume = "9", number = "2", pages = "13:1--13:??", month = sep, year = "2014", CODEN = "????", DOI = "http://dx.doi.org/10.1145/2637484", ISSN = "1556-4681 (print), 1556-472X (electronic)", ISSN-L = "1556-4681", bibdate = "Tue Oct 7 18:49:26 MDT 2014", bibsource = "http://www.acm.org/pubs/contents/journals/tkdd/; http://www.math.utah.edu/pub/tex/bib/tkdd.bib", abstract = "Finding the most interesting correlations among items is essential for problems in many commercial, medical, and scientific domains. Although there are numerous measures available for evaluating correlations, different correlation measures provide drastically different results. Piatetsky-Shapiro provided three mandatory properties for any reasonable correlation measure, and Tan et al. proposed several properties to categorize correlation measures; however, it is still hard for users to choose the desirable correlation measures according to their needs. In order to solve this problem, we explore the effectiveness problem in three ways. First, we propose two desirable properties and two optional properties for correlation measure selection and study the property satisfaction for different correlation measures. Second, we study different techniques to adjust correlation measures and propose two new correlation measures: the Simplified $ \chi^2 $ with Continuity Correction and the Simplified $ \chi^2 $ with Support. Third, we analyze the upper and lower bounds of different measures and categorize them by the bound differences. 
Combining these three directions, we provide guidelines for users to choose the proper measure according to their needs.", acknowledgement = ack-nhfb, articleno = "13", fjournal = "ACM Transactions on Knowledge Discovery from Data (TKDD)", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J1054", } @Article{Huang:2014:PBA, author = "Hao Huang and Hong Qin and Shinjae Yoo and Dantong Yu", title = "Physics-Based Anomaly Detection Defined on Manifold Space", journal = j-TKDD, volume = "9", number = "2", pages = "14:1--14:??", month = sep, year = "2014", CODEN = "????", DOI = "http://dx.doi.org/10.1145/2641574", ISSN = "1556-4681 (print), 1556-472X (electronic)", ISSN-L = "1556-4681", bibdate = "Tue Oct 7 18:49:26 MDT 2014", bibsource = "http://www.acm.org/pubs/contents/journals/tkdd/; http://www.math.utah.edu/pub/tex/bib/tkdd.bib", abstract = "Current popular anomaly detection algorithms are capable of detecting global anomalies but often fail to distinguish local anomalies from normal instances. Inspired by contemporary physics theory (i.e., heat diffusion and quantum mechanics), we propose two unsupervised anomaly detection algorithms. Building on the embedding manifold derived from heat diffusion, we devise Local Anomaly Descriptor (LAD), which faithfully reveals the intrinsic neighborhood density. It uses a scale-dependent umbrella operator to bridge global and local properties, which makes LAD more informative within an adaptive scope of neighborhood. To offer more stability of local density measurement on scaling parameter tuning, we formulate Fermi Density Descriptor (FDD), which measures the probability of a fermion particle being at a specific location. By choosing the stable energy distribution function, FDD steadily distinguishes anomalies from normal instances with any scaling parameter setting. 
To further enhance the efficacy of our proposed algorithms, we explore the utility of anisotropic Gaussian kernel (AGK), which offers better manifold-aware affinity information. We also quantify and examine the effect of different Laplacian normalizations for anomaly detection. Comprehensive experiments on both synthetic and benchmark datasets verify that our proposed algorithms outperform the existing anomaly detection algorithms.", acknowledgement = ack-nhfb, articleno = "14", fjournal = "ACM Transactions on Knowledge Discovery from Data (TKDD)", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J1054", } @Article{Gionis:2015:ISI, author = "Aristides Gionis and Hang Li", title = "Introduction to the Special Issue {{ACM} {SIGKDD}} 2013", journal = j-TKDD, volume = "9", number = "3", pages = "15:1--15:??", month = apr, year = "2015", CODEN = "????", DOI = "http://dx.doi.org/10.1145/2700993", ISSN = "1556-4681 (print), 1556-472X (electronic)", ISSN-L = "1556-4681", bibdate = "Tue Apr 14 09:22:28 MDT 2015", bibsource = "http://www.acm.org/pubs/contents/journals/tkdd/; http://www.math.utah.edu/pub/tex/bib/tkdd.bib", acknowledgement = ack-nhfb, articleno = "15e", fjournal = "ACM Transactions on Knowledge Discovery from Data (TKDD)", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J1054", } @Article{Jha:2015:SES, author = "Madhav Jha and C. 
Seshadhri and Ali Pinar", title = "A Space-Efficient Streaming Algorithm for Estimating Transitivity and Triangle Counts Using the Birthday Paradox", journal = j-TKDD, volume = "9", number = "3", pages = "15:1--15:??", month = feb, year = "2015", CODEN = "????", DOI = "http://dx.doi.org/10.1145/2700395", ISSN = "1556-4681 (print), 1556-472X (electronic)", ISSN-L = "1556-4681", bibdate = "Fri Mar 6 09:34:37 MST 2015", bibsource = "http://www.acm.org/pubs/contents/journals/tkdd/; http://www.math.utah.edu/pub/tex/bib/tkdd.bib", abstract = "We design a space-efficient algorithm that approximates the transitivity (global clustering coefficient) and total triangle count with only a single pass through a graph given as a stream of edges. Our procedure is based on the classic probabilistic result, the birthday paradox. When the transitivity is constant and there are more edges than wedges (common properties for social networks), we can prove that our algorithm requires $O( \sqrt n )$ space ($n$ is the number of vertices) to provide accurate estimates. We run a detailed set of experiments on a variety of real graphs and demonstrate that the memory requirement of the algorithm is a tiny fraction of the graph. For example, even for a graph with 200 million edges, our algorithm stores just 40,000 edges to give accurate results. 
Being a single pass streaming algorithm, our procedure also maintains a real-time estimate of the transitivity/number of triangles of a graph by storing a minuscule fraction of edges.", acknowledgement = ack-nhfb, articleno = "15", fjournal = "ACM Transactions on Knowledge Discovery from Data (TKDD)", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J1054", } @Article{Tang:2015:FMT, author = "Lu-An Tang and Xiao Yu and Quanquan Gu and Jiawei Han and Guofei Jiang and Alice Leung and Thomas {La Porta}", title = "A Framework of Mining Trajectories from Untrustworthy Data in Cyber-Physical System", journal = j-TKDD, volume = "9", number = "3", pages = "16:1--16:??", month = feb, year = "2015", CODEN = "????", DOI = "http://dx.doi.org/10.1145/2700394", ISSN = "1556-4681 (print), 1556-472X (electronic)", ISSN-L = "1556-4681", bibdate = "Fri Mar 6 09:34:37 MST 2015", bibsource = "http://www.acm.org/pubs/contents/journals/tkdd/; http://www.math.utah.edu/pub/tex/bib/tkdd.bib", abstract = "A cyber-physical system (CPS) integrates physical (i.e., sensor) devices with cyber (i.e., informational) components to form a context-sensitive system that responds intelligently to dynamic changes in real-world situations. The CPS has wide applications in scenarios such as environment monitoring, battlefield surveillance, and traffic control. One key research problem of CPS is called mining lines in the sand. With a large number of sensors (sand) deployed in a designated area, the CPS is required to discover all trajectories (lines) of passing intruders in real time. There are two crucial challenges that need to be addressed: (1) the collected sensor data are not trustworthy, and (2) the intruders do not send out any identification information. The system needs to distinguish multiple intruders and track their movements. This study proposes a method called LiSM (Line-in-the-Sand Miner) to discover trajectories from untrustworthy sensor data. 
LiSM constructs a watching network from sensor data and computes the locations of intruder appearances based on the link information of the network. The system retrieves a cone model from the historical trajectories to track multiple intruders. Finally, the system validates the mining results and updates sensors' reliability scores in a feedback process. In addition, LoRM (Line-on-the-Road Miner) is proposed for trajectory discovery on road networks---mining lines on the roads. LoRM employs a filtering-and-refinement framework to reduce the distance computational overhead on road networks and uses a shortest-path-measure to track intruders. The proposed methods are evaluated with extensive experiments on big datasets. The experimental results show that the proposed methods achieve higher accuracy and efficiency in trajectory mining tasks.", acknowledgement = ack-nhfb, articleno = "16", fjournal = "ACM Transactions on Knowledge Discovery from Data (TKDD)", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J1054", } @Article{Wang:2015:QDR, author = "Zheng Wang and Jieping Ye", title = "Querying Discriminative and Representative Samples for Batch Mode Active Learning", journal = j-TKDD, volume = "9", number = "3", pages = "17:1--17:??", month = feb, year = "2015", CODEN = "????", DOI = "http://dx.doi.org/10.1145/2700408", ISSN = "1556-4681 (print), 1556-472X (electronic)", ISSN-L = "1556-4681", bibdate = "Fri Mar 6 09:34:37 MST 2015", bibsource = "http://www.acm.org/pubs/contents/journals/tkdd/; http://www.math.utah.edu/pub/tex/bib/tkdd.bib", abstract = "Empirical risk minimization (ERM) provides a principled guideline for many machine learning and data mining algorithms. Under the ERM principle, one minimizes an upper bound of the true risk, which is approximated by the summation of empirical risk and the complexity of the candidate classifier class. To guarantee a satisfactory learning performance, ERM requires that the training data are i.i.d.
sampled from the unknown source distribution. However, this may not be the case in active learning, where one selects the most informative samples to label, and these data may not follow the source distribution. In this article, we generalize the ERM principle to the active learning setting. We derive a novel form of upper bound for the true risk in the active learning setting; by minimizing this upper bound, we develop a practical batch mode active learning method. The proposed formulation involves a nonconvex integer programming optimization problem. We solve it efficiently by an alternating optimization method. Our method is shown to query the most informative samples while preserving the source distribution as much as possible, thus identifying the most uncertain and representative queries. We further extend our method to multiclass active learning by introducing novel pseudolabels in the multiclass case and developing an efficient algorithm. Experiments on benchmark datasets and real-world applications demonstrate the superior performance of our proposed method compared to state-of-the-art methods.", acknowledgement = ack-nhfb, articleno = "17", fjournal = "ACM Transactions on Knowledge Discovery from Data (TKDD)", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J1054", } @Article{Gopal:2015:HBI, author = "Siddharth Gopal and Yiming Yang", title = "Hierarchical {Bayesian} Inference and Recursive Regularization for Large-Scale Classification", journal = j-TKDD, volume = "9", number = "3", pages = "18:1--18:??", month = apr, year = "2015", CODEN = "????", DOI = "http://dx.doi.org/10.1145/2629585", ISSN = "1556-4681 (print), 1556-472X (electronic)", ISSN-L = "1556-4681", bibdate = "Tue Apr 14 09:22:28 MDT 2015", bibsource = "http://www.acm.org/pubs/contents/journals/tkdd/; http://www.math.utah.edu/pub/tex/bib/tkdd.bib", abstract = "In this article, we address open challenges in large-scale classification, focusing on how to effectively leverage the 
dependency structures (hierarchical or graphical) among class labels, and how to make the inference scalable in jointly optimizing all model parameters. We propose two main approaches, namely the hierarchical Bayesian inference framework and the recursive regularization scheme. The key idea in both approaches is to reinforce the similarity among parameter across the nodes in a hierarchy or network based on the proximity and connectivity of the nodes. For scalability, we develop hierarchical variational inference algorithms and fast dual coordinate descent training procedures with parallelization. In our experiments for classification problems with hundreds of thousands of classes and millions of training instances with terabytes of parameters, the proposed methods show consistent and statistically significant improvements over other competing approaches, and the best results on multiple benchmark datasets for large-scale classification.", acknowledgement = ack-nhfb, articleno = "18", fjournal = "ACM Transactions on Knowledge Discovery from Data (TKDD)", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J1054", } @Article{Yin:2015:MLB, author = "Hongzhi Yin and Bin Cui and Ling Chen and Zhiting Hu and Chengqi Zhang", title = "Modeling Location-Based User Rating Profiles for Personalized Recommendation", journal = j-TKDD, volume = "9", number = "3", pages = "19:1--19:??", month = apr, year = "2015", CODEN = "????", DOI = "http://dx.doi.org/10.1145/2663356", ISSN = "1556-4681 (print), 1556-472X (electronic)", ISSN-L = "1556-4681", bibdate = "Tue Apr 14 09:22:28 MDT 2015", bibsource = "http://www.acm.org/pubs/contents/journals/tkdd/; http://www.math.utah.edu/pub/tex/bib/tkdd.bib", abstract = "This article proposes LA-LDA, a location-aware probabilistic generative model that exploits location-based ratings to model user profiles and produce recommendations. 
Most of the existing recommendation models do not consider the spatial information of users or items; however, LA-LDA supports three classes of location-based ratings, namely spatial user ratings for nonspatial items, nonspatial user ratings for spatial items, and spatial user ratings for spatial items. LA-LDA consists of two components, ULA-LDA and ILA-LDA, which are designed to take into account user and item location information, respectively. The component ULA-LDA explicitly incorporates and quantifies the influence from local public preferences to produce recommendations by considering user home locations, whereas the component ILA-LDA recommends items that are closer in both taste and travel distance to the querying users by capturing item co-occurrence patterns, as well as item location co-occurrence patterns. The two components of LA-LDA can be applied either separately or collectively, depending on the available types of location-based ratings. To demonstrate the applicability and flexibility of the LA-LDA model, we deploy it to both top-$k$ recommendation and cold start recommendation scenarios.
Experimental evidence on large-scale real-world data, including the data from Gowalla (a location-based social network), DoubanEvent (an event-based social network), and MovieLens (a movie recommendation system), reveal that LA-LDA models user profiles more accurately by outperforming existing recommendation models for top-$k$ recommendation and the cold start problem.", acknowledgement = ack-nhfb, articleno = "19", fjournal = "ACM Transactions on Knowledge Discovery from Data (TKDD)", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J1054", } @Article{Hu:2015:PSD, author = "Juhua Hu and De-Chuan Zhan and Xintao Wu and Yuan Jiang and Zhi-Hua Zhou", title = "Pairwised Specific Distance Learning from Physical Linkages", journal = j-TKDD, volume = "9", number = "3", pages = "20:1--20:??", month = apr, year = "2015", CODEN = "????", DOI = "http://dx.doi.org/10.1145/2700405", ISSN = "1556-4681 (print), 1556-472X (electronic)", ISSN-L = "1556-4681", bibdate = "Tue Apr 14 09:22:28 MDT 2015", bibsource = "http://www.acm.org/pubs/contents/journals/tkdd/; http://www.math.utah.edu/pub/tex/bib/tkdd.bib", abstract = "In real tasks, usually a good classification performance can only be obtained when a good distance metric is obtained; therefore, distance metric learning has attracted significant attention in the past few years. Typical studies of distance metric learning evaluate how to construct an appropriate distance metric that is able to separate training data points from different classes or satisfy a set of constraints (e.g., must-links and/or cannot-links). It is noteworthy that this task becomes challenging when there are only limited labeled training data points and no constraints are given explicitly. Moreover, most existing approaches aim to construct a global distance metric that is applicable to all data points. However, different data points may have different properties and may require different distance metrics.
We notice that data points in real tasks are often connected by physical links (e.g., people are linked with each other in social networks; personal webpages are often connected to other webpages, including nonpersonal webpages), but the linkage information has not been exploited in distance metric learning. In this article, we develop a pairwised specific distance (PSD) approach that exploits the structures of physical linkages and in particular captures the key observations that nonmetric and clique linkages imply the appearance of different or unique semantics, respectively. It is noteworthy that, rather than generating a global distance, PSD generates different distances for different pairs of data points; this property is desired in applications involving complicated data semantics. We mainly present PSD for multi-class learning and further extend it to multi-label learning. Experimental results validate the effectiveness of PSD, especially in the scenarios in which there are very limited labeled training data points and no explicit constraints are given.", acknowledgement = ack-nhfb, articleno = "20", fjournal = "ACM Transactions on Knowledge Discovery from Data (TKDD)", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J1054", } @Article{Soundarajan:2015:ULG, author = "Sucheta Soundarajan and John E. Hopcroft", title = "Use of Local Group Information to Identify Communities in Networks", journal = j-TKDD, volume = "9", number = "3", pages = "21:1--21:??", month = apr, year = "2015", CODEN = "????", DOI = "http://dx.doi.org/10.1145/2700404", ISSN = "1556-4681 (print), 1556-472X (electronic)", ISSN-L = "1556-4681", bibdate = "Tue Apr 14 09:22:28 MDT 2015", bibsource = "http://www.acm.org/pubs/contents/journals/tkdd/; http://www.math.utah.edu/pub/tex/bib/tkdd.bib", abstract = "The recent interest in networks has inspired a broad range of work on algorithms and techniques to characterize, identify, and extract communities from networks. 
Such efforts are complicated by a lack of consensus on what a ``community'' truly is, and these disagreements have led to a wide variety of mathematical formulations for describing communities. Often, these mathematical formulations, such as modularity and conductance, have been founded in the general principle that communities, like a $ G(n, p) $ graph, are ``round,'' with connections throughout the entire community, and so algorithms were developed to optimize such mathematical measures. More recently, a variety of algorithms have been developed that, rather than expecting connectivity through the entire community, seek out very small groups of well-connected nodes and then connect these groups into larger communities. In this article, we examine seven real networks, each containing external annotation that allows us to identify ``annotated communities.'' A study of these annotated communities gives insight into why the second category of community detection algorithms may be more successful than the first category. We then present a flexible algorithm template that is based on the idea of joining together small sets of nodes. In this template, we first identify very small, tightly connected ``subcommunities'' of nodes, each corresponding to a single node's ``perception'' of the network around it. We then create a new network in which each node represents such a subcommunity, and then identify communities in this new network. Because each node can appear in multiple subcommunities, this method allows us to detect overlapping communities. When evaluated on real data, we show that our template outperforms many other state-of-the-art algorithms.", acknowledgement = ack-nhfb, articleno = "21", fjournal = "ACM Transactions on Knowledge Discovery from Data (TKDD)", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J1054", } @Article{Wang:2015:UCN, author = "Pinghui Wang and Junzhou Zhao and John C. S.
Lui and Don Towsley and Xiaohong Guan", title = "Unbiased Characterization of Node Pairs over Large Graphs", journal = j-TKDD, volume = "9", number = "3", pages = "22:1--22:??", month = apr, year = "2015", CODEN = "????", DOI = "http://dx.doi.org/10.1145/2700393", ISSN = "1556-4681 (print), 1556-472X (electronic)", ISSN-L = "1556-4681", bibdate = "Tue Apr 14 09:22:28 MDT 2015", bibsource = "http://www.acm.org/pubs/contents/journals/tkdd/; http://www.math.utah.edu/pub/tex/bib/tkdd.bib", abstract = "Characterizing user pair relationships is important for applications such as friend recommendation and interest targeting in online social networks (OSNs). Due to the large-scale nature of such networks, it is infeasible to enumerate all user pairs and thus sampling is used. In this article, we show that it is a great challenge for OSN service providers to characterize user pair relationships, even when they possess the complete graph topology. The reason is that when sampling techniques (i.e., uniform vertex sampling (UVS) and random walk (RW)) are naively applied, they can introduce large biases, particularly for estimating similarity distribution of user pairs with constraints like existence of mutual neighbors, which is important for applications such as identifying network homophily. Estimating statistics of user pairs is more challenging in the absence of the complete topology information, as an unbiased sampling technique like UVS is usually not allowed and exploring the OSN graph topology is expensive. To address these challenges, we present unbiased sampling methods to characterize user pair properties based on UVS and RW techniques. We carry out an evaluation of our methods to show their accuracy and efficiency. 
Finally, we apply our methods to three OSNs---Foursquare, Douban, and Xiami---and discover that significant homophily is present in these networks.", acknowledgement = ack-nhfb, articleno = "22", fjournal = "ACM Transactions on Knowledge Discovery from Data (TKDD)", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J1054", } @Article{Vlachos:2015:DPC, author = "Michail Vlachos and Johannes Schneider and Vassilios G. Vassiliadis", title = "On Data Publishing with Clustering Preservation", journal = j-TKDD, volume = "9", number = "3", pages = "23:1--23:??", month = apr, year = "2015", CODEN = "????", DOI = "http://dx.doi.org/10.1145/2700403", ISSN = "1556-4681 (print), 1556-472X (electronic)", ISSN-L = "1556-4681", bibdate = "Tue Apr 14 09:22:28 MDT 2015", bibsource = "http://www.acm.org/pubs/contents/journals/tkdd/; http://www.math.utah.edu/pub/tex/bib/tkdd.bib", abstract = "The emergence of cloud-based storage services is opening up new avenues in data exchange and data dissemination. This has amplified the interest in right-protection mechanisms to establish ownership in the event of data leakage. Current right-protection technologies, however, rarely provide strong guarantees on dataset utility after the protection process. This work presents techniques that explicitly address this topic and provably preserve the outcome of certain mining operations. In particular, we take special care to guarantee that the outcome of hierarchical clustering operations remains the same before and after right protection. Our approach considers all prevalent hierarchical clustering variants: single-, complete-, and average-linkage. We imprint the ownership in a dataset using watermarking principles, and we derive tight bounds on the expansion/contraction of distances incurred by the process. We leverage our analysis to design fast algorithms for right protection without exhaustively searching the vast design space.
Finally, because the right-protection process introduces a user-tunable distortion on the dataset, we explore the possibility of using this mechanism for data obfuscation. We quantify the tradeoff between obfuscation and utility for spatiotemporal datasets and discover very favorable characteristics of the process. An additional advantage is that when one is interested in both right-protecting and obfuscating the original data values, the proposed mechanism can accomplish both tasks simultaneously.", acknowledgement = ack-nhfb, articleno = "23", fjournal = "ACM Transactions on Knowledge Discovery from Data (TKDD)", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J1054", } @Article{VazDeMelo:2015:UDP, author = "Pedro O. S. {Vaz De Melo} and Christos Faloutsos and Renato Assun{\c{c}}{\~a}o and Rodrigo Alves and Antonio A. F. Loureiro", title = "Universal and Distinct Properties of Communication Dynamics: How to Generate Realistic Inter-event Times", journal = j-TKDD, volume = "9", number = "3", pages = "24:1--24:??", month = apr, year = "2015", CODEN = "????", DOI = "http://dx.doi.org/10.1145/2700399", ISSN = "1556-4681 (print), 1556-472X (electronic)", ISSN-L = "1556-4681", bibdate = "Tue Apr 14 09:22:28 MDT 2015", bibsource = "http://www.acm.org/pubs/contents/journals/tkdd/; http://www.math.utah.edu/pub/tex/bib/tkdd.bib", abstract = "With the advancement of information systems, means of communications are becoming cheaper, faster, and more available. Today, millions of people carrying smartphones or tablets are able to communicate practically any time and anywhere they want. They can access their e-mails, comment on weblogs, watch and post videos and photos (as well as comment on them), and make phone calls or text messages almost ubiquitously. Given this scenario, in this article, we tackle a fundamental aspect of this new era of communication: How the time intervals between communication events behave for different technologies and means of communications. 
Are there universal patterns for the Inter-Event Time Distribution (IED)? How do inter-event times behave differently among particular technologies? To answer these questions, we analyzed eight different datasets from real and modern communication data and found four well-defined patterns seen in all the eight datasets. Moreover, we propose the use of the Self-Feeding Process (SFP) to generate inter-event times between communications. The SFP is an extremely parsimonious point process that requires at most two parameters and is able to generate inter-event times with all the universal properties we observed in the data. We also show three potential applications of the SFP: as a framework to generate a synthetic dataset containing realistic communication events of any one of the analyzed means of communications, as a technique to detect anomalies, and as a building block for more specific models that aim to encompass the particularities seen in each of the analyzed systems.", acknowledgement = ack-nhfb, articleno = "24", fjournal = "ACM Transactions on Knowledge Discovery from Data (TKDD)", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J1054", } @Article{Zhang:2015:WIY, author = "Jing Zhang and Jie Tang and Juanzi Li and Yang Liu and Chunxiao Xing", title = "Who Influenced You? {Predicting} Retweet via Social Influence Locality", journal = j-TKDD, volume = "9", number = "3", pages = "25:1--25:??", month = apr, year = "2015", CODEN = "????", DOI = "http://dx.doi.org/10.1145/2700398", ISSN = "1556-4681 (print), 1556-472X (electronic)", ISSN-L = "1556-4681", bibdate = "Tue Apr 14 09:22:28 MDT 2015", bibsource = "http://www.acm.org/pubs/contents/journals/tkdd/; http://www.math.utah.edu/pub/tex/bib/tkdd.bib", abstract = "Social influence occurs when one's opinions, emotions, or behaviors are affected by others in a social network. However, social influence takes many forms, and its underlying mechanism is still unclear. 
For example, how is one's behavior influenced by a group of friends who know each other and by the friends from different ego friend circles? In this article, we study the social influence problem in a large microblogging network. Particularly, we consider users' (re)tweet behaviors and focus on investigating how friends in one's ego network influence retweet behaviors. We propose a novel notion of social influence locality and develop two instantiation functions based on pairwise influence and structural diversity. The defined influence locality functions have strong predictive power. Without any additional features, we can obtain an F1-score of 71.65\% for predicting users' retweet behaviors by training a logistic regression classifier based on the defined influence locality functions. We incorporate social influence locality into a factor graph model, which can further leverage the network-based correlation. Our experiments on the large microblogging network show that the model significantly improves the precision of retweet prediction. Our analysis also reveals several intriguing discoveries. For example, if you have six friends retweeting a microblog, the average likelihood that you will also retweet it strongly depends on the structure among the six friends: The likelihood will significantly drop (only 1/6) when the six friends do not know each other, compared with the case when the six friends know each other.", acknowledgement = ack-nhfb, articleno = "25", fjournal = "ACM Transactions on Knowledge Discovery from Data (TKDD)", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J1054", } @Article{Xie:2015:MMA, author = "Hong Xie and John C. S.
Lui", title = "Mathematical Modeling and Analysis of Product Rating with Partial Information", journal = j-TKDD, volume = "9", number = "4", pages = "26:1--26:??", month = jun, year = "2015", CODEN = "????", DOI = "http://dx.doi.org/10.1145/2700386", ISSN = "1556-4681 (print), 1556-472X (electronic)", ISSN-L = "1556-4681", bibdate = "Wed Jun 3 06:21:22 MDT 2015", bibsource = "http://www.acm.org/pubs/contents/journals/tkdd/; http://www.math.utah.edu/pub/tex/bib/tkdd.bib", abstract = "Many Web services like Amazon, Epinions, and TripAdvisor provide historical product ratings so that users can evaluate the quality of products. Product ratings are important because they affect how well a product will be adopted by the market. The challenge is that we only have partial information on these ratings: each user assigns ratings to only a small subset of products. Under this partial information setting, we explore a number of fundamental questions. What is the minimum number of ratings a product needs so that one can make a reliable evaluation of its quality? How may users' misbehavior, such as cheating in product rating, affect the evaluation result? To answer these questions, we present a probabilistic model to capture various important factors (e.g., rating aggregation rules, rating behavior) that may influence the product quality assessment under the partial information setting. We derive the minimum number of ratings needed to produce a reliable indicator on the quality of a product. We extend our model to accommodate users' misbehavior in product rating. We derive the maximum fraction of misbehaving users that a rating aggregation rule can tolerate and the minimum number of ratings needed to compensate. We carry out experiments using both synthetic and real-world data (from Amazon and TripAdvisor). 
We not only validate our model but also show that the ``average rating rule'' produces more reliable and robust product quality assessments than the ``majority rating rule'' and the ``median rating rule'' in aggregating product ratings. Last, we perform experiments on two movie rating datasets (from Flixster and Netflix) to demonstrate how to apply our framework to improve the applications of recommender systems.", acknowledgement = ack-nhfb, articleno = "26", fjournal = "ACM Transactions on Knowledge Discovery from Data (TKDD)", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J1054", } @Article{Esuli:2015:OTQ, author = "Andrea Esuli and Fabrizio Sebastiani", title = "Optimizing Text Quantifiers for Multivariate Loss Functions", journal = j-TKDD, volume = "9", number = "4", pages = "27:1--27:??", month = jun, year = "2015", CODEN = "????", DOI = "http://dx.doi.org/10.1145/2700406", ISSN = "1556-4681 (print), 1556-472X (electronic)", ISSN-L = "1556-4681", bibdate = "Wed Jun 3 06:21:22 MDT 2015", bibsource = "http://www.acm.org/pubs/contents/journals/tkdd/; http://www.math.utah.edu/pub/tex/bib/tkdd.bib", abstract = "We address the problem of quantification, a supervised learning task whose goal is, given a class, to estimate the relative frequency (or prevalence) of the class in a dataset of unlabeled items. Quantification has several applications in data and text mining, such as estimating the prevalence of positive reviews in a set of reviews of a given product or estimating the prevalence of a given support issue in a dataset of transcripts of phone calls to tech support. So far, quantification has been addressed by learning a general-purpose classifier, counting the unlabeled items that have been assigned the class, and tuning the obtained counts according to some heuristics. 
In this article, we depart from the tradition of using general-purpose classifiers and use instead a supervised learning model for structured prediction, capable of generating classifiers directly optimized for the (multivariate and nonlinear) function used for evaluating quantification accuracy. The experiments that we have run on 5,500 binary high-dimensional datasets (averaging more than 14,000 documents each) show that this method is more accurate, more stable, and more efficient than existing state-of-the-art quantification methods.", acknowledgement = ack-nhfb, articleno = "27", fjournal = "ACM Transactions on Knowledge Discovery from Data (TKDD)", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J1054", } @Article{Lin:2015:IMS, author = "Bing-Rong Lin and Daniel Kifer", title = "Information Measures in Statistical Privacy and Data Processing Applications", journal = j-TKDD, volume = "9", number = "4", pages = "28:1--28:??", month = jun, year = "2015", CODEN = "????", DOI = "http://dx.doi.org/10.1145/2700407", ISSN = "1556-4681 (print), 1556-472X (electronic)", ISSN-L = "1556-4681", bibdate = "Wed Jun 3 06:21:22 MDT 2015", bibsource = "http://www.acm.org/pubs/contents/journals/tkdd/; http://www.math.utah.edu/pub/tex/bib/tkdd.bib", abstract = "In statistical privacy, utility refers to two concepts: information preservation, how much statistical information is retained by a sanitizing algorithm, and usability, how (and with how much difficulty) one extracts this information to build statistical models, answer queries, and so forth. Some scenarios incentivize a separation between information preservation and usability, so that the data owner first chooses a sanitizing algorithm to maximize a measure of information preservation, and, afterward, the data consumers process the sanitized output according to their various individual needs [Ghosh et al. 2009; Williams and McSherry 2010]. 
We analyze the information-preserving properties of utility measures with a combination of two new and three existing utility axioms and study how violations of an axiom can be fixed. We show that the average (over possible outputs of the sanitizer) error of Bayesian decision makers forms the unique class of utility measures that satisfy all of the axioms. The axioms are agnostic to Bayesian concepts such as subjective probabilities and hence strengthen support for Bayesian views in privacy research. In particular, this result connects information preservation to aspects of usability---if the information preservation of a sanitizing algorithm should be measured as the average error of a Bayesian decision maker, shouldn't Bayesian decision theory be a good choice when it comes to using the sanitized outputs for various purposes? We put this idea to the test in the unattributed histogram problem where our decision-theoretic postprocessing algorithm empirically outperforms previously proposed approaches.", acknowledgement = ack-nhfb, articleno = "28", fjournal = "ACM Transactions on Knowledge Discovery from Data (TKDD)", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J1054", } @Article{Huang:2015:DAC, author = "Hao Huang and Shinjae Yoo and Dantong Yu and Hong Qin", title = "Density-Aware Clustering Based on Aggregated Heat Kernel and Its Transformation", journal = j-TKDD, volume = "9", number = "4", pages = "29:1--29:??", month = jun, year = "2015", CODEN = "????", DOI = "http://dx.doi.org/10.1145/2700385", ISSN = "1556-4681 (print), 1556-472X (electronic)", ISSN-L = "1556-4681", bibdate = "Wed Jun 3 06:21:22 MDT 2015", bibsource = "http://www.acm.org/pubs/contents/journals/tkdd/; http://www.math.utah.edu/pub/tex/bib/tkdd.bib", abstract = "Current spectral clustering algorithms suffer from the sensitivity to existing noise and parameter scaling and may not be aware of different density distributions across clusters. 
If these problems are left untreated, the consequent clustering results cannot accurately represent true data patterns, in particular, for complex real-world datasets with heterogeneous densities. This article aims to solve these problems by proposing a diffusion-based Aggregated Heat Kernel (AHK) to improve the clustering stability, and a Local Density Affinity Transformation (LDAT) to correct the bias originating from different cluster densities. AHK statistically models the heat diffusion traces along the entire time scale, so it ensures robustness during the clustering process, while LDAT probabilistically reveals the local density of each instance and suppresses the local density bias in the affinity matrix. Our proposed framework integrates these two techniques systematically. As a result, it not only provides an advanced noise-resisting and density-aware spectral mapping to the original dataset but also demonstrates the stability during the processing of tuning the scaling parameter (which usually controls the range of neighborhood). Furthermore, our framework works well with the majority of similarity kernels, which ensures its applicability to many types of data and problem domains. The systematic experiments on different applications show that our proposed algorithm outperforms state-of-the-art clustering algorithms for the data with heterogeneous density distributions and achieves robust clustering performance with respect to tuning the scaling parameter and handling various levels and types of noise.", acknowledgement = ack-nhfb, articleno = "29", fjournal = "ACM Transactions on Knowledge Discovery from Data (TKDD)", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J1054", } @Article{Yu:2015:CSF, author = "Kui Yu and Wei Ding and Dan A. 
Simovici and Hao Wang and Jian Pei and Xindong Wu", title = "Classification with Streaming Features: an Emerging-Pattern Mining Approach", journal = j-TKDD, volume = "9", number = "4", pages = "30:1--30:??", month = jun, year = "2015", CODEN = "????", DOI = "http://dx.doi.org/10.1145/2700409", ISSN = "1556-4681 (print), 1556-472X (electronic)", ISSN-L = "1556-4681", bibdate = "Wed Jun 3 06:21:22 MDT 2015", bibsource = "http://www.acm.org/pubs/contents/journals/tkdd/; http://www.math.utah.edu/pub/tex/bib/tkdd.bib", abstract = "Many datasets from real-world applications have very high-dimensional or increasing feature space. It is a new research problem to learn and maintain a classifier to deal with very high dimensionality or streaming features. In this article, we adapt the well-known emerging-pattern--based classification models and propose a semi-streaming approach. For streaming features, it is computationally expensive or even prohibitive to mine long-emerging patterns, and it is nontrivial to integrate emerging-pattern mining with feature selection. We present an online feature selection step, which is capable of selecting and maintaining a pool of effective features from a feature stream. Then, in our offline step, separated from the online step, we periodically compute and update emerging patterns from the pool of selected features from the online step. We evaluate the effectiveness and efficiency of the proposed method using a series of benchmark datasets and a real-world case study on Mars crater detection. Our proposed method yields classification performance comparable to the state-of-art static classification methods. 
Most important, the proposed method is significantly faster and can efficiently handle datasets with streaming features.", acknowledgement = ack-nhfb, articleno = "30", fjournal = "ACM Transactions on Knowledge Discovery from Data (TKDD)", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J1054", } @Article{Liu:2015:SEH, author = "Guimei Liu and Haojun Zhang and Mengling Feng and Limsoon Wong and See-Kiong Ng", title = "Supporting Exploratory Hypothesis Testing and Analysis", journal = j-TKDD, volume = "9", number = "4", pages = "31:1--31:??", month = jun, year = "2015", CODEN = "????", DOI = "http://dx.doi.org/10.1145/2701430", ISSN = "1556-4681 (print), 1556-472X (electronic)", ISSN-L = "1556-4681", bibdate = "Wed Jun 3 06:21:22 MDT 2015", bibsource = "http://www.acm.org/pubs/contents/journals/tkdd/; http://www.math.utah.edu/pub/tex/bib/tkdd.bib", abstract = "Conventional hypothesis testing is carried out in a hypothesis-driven manner. A scientist must first formulate a hypothesis based on what he or she sees and then devise a variety of experiments to test it. Given the rapid growth of data, it has become virtually impossible for a person to manually inspect all data to find all of the interesting hypotheses for testing. In this article, we propose and develop a data-driven framework for automatic hypothesis testing and analysis. We define a hypothesis as a comparison between two or more subpopulations. We find subpopulations for comparison using frequent pattern mining techniques and then pair them up for statistical hypothesis testing. We also generate additional information for further analysis of the hypotheses that are deemed significant. The number of hypotheses generated can be very large, and many of them are very similar. We develop algorithms to remove redundant hypotheses and present a succinct set of significant hypotheses to users. We conducted a set of experiments to show the efficiency and effectiveness of the proposed algorithms. 
The results show that our system can help users (1) identify significant hypotheses efficiently, (2) isolate the reasons behind significant hypotheses efficiently, and (3) find confounding factors that form Simpson's paradoxes with discovered significant hypotheses.", acknowledgement = ack-nhfb, articleno = "31", fjournal = "ACM Transactions on Knowledge Discovery from Data (TKDD)", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J1054", } @Article{Greco:2015:PDU, author = "Gianluigi Greco and Antonella Guzzo and Francesco Lupia and Luigi Pontieri", title = "Process Discovery under Precedence Constraints", journal = j-TKDD, volume = "9", number = "4", pages = "32:1--32:??", month = jun, year = "2015", CODEN = "????", DOI = "http://dx.doi.org/10.1145/2710020", ISSN = "1556-4681 (print), 1556-472X (electronic)", ISSN-L = "1556-4681", bibdate = "Wed Jun 3 06:21:22 MDT 2015", bibsource = "http://www.acm.org/pubs/contents/journals/tkdd/; http://www.math.utah.edu/pub/tex/bib/tkdd.bib", abstract = "Process discovery has emerged as a powerful approach to support the analysis and the design of complex processes. It consists of analyzing a set of traces registering the sequence of tasks performed along several enactments of a transactional system, in order to build a process model that can explain all the episodes recorded over them. An approach to accomplish this task is presented that can benefit from the background knowledge that, in many cases, is available to the analysts taking care of the process (re-)design. The approach is based on encoding the information gathered from the log and the (possibly) given background knowledge in terms of precedence constraints, that is, of constraints over the topology of the resulting process models. Mining algorithms are eventually formulated in terms of reasoning problems over precedence constraints, and the computational complexity of such problems is thoroughly analyzed by tracing their tractability frontier. 
Solution algorithms are proposed and their properties analyzed. These algorithms have been implemented in a prototype system, and results of a thorough experimental activity are discussed.", acknowledgement = ack-nhfb, articleno = "32", fjournal = "ACM Transactions on Knowledge Discovery from Data (TKDD)", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J1054", } @Article{Mirbakhsh:2015:ITR, author = "Nima Mirbakhsh and Charles X. Ling", title = "Improving Top-{$N$} Recommendation for Cold-Start Users via Cross-Domain Information", journal = j-TKDD, volume = "9", number = "4", pages = "33:1--33:??", month = jun, year = "2015", CODEN = "????", DOI = "http://dx.doi.org/10.1145/2724720", ISSN = "1556-4681 (print), 1556-472X (electronic)", ISSN-L = "1556-4681", bibdate = "Wed Jun 3 06:21:22 MDT 2015", bibsource = "http://www.acm.org/pubs/contents/journals/tkdd/; http://www.math.utah.edu/pub/tex/bib/tkdd.bib", abstract = "Making accurate recommendations for cold-start users is a challenging yet important problem in recommendation systems. Including more information from other domains is a natural solution to improve the recommendations. However, most previous work in cross-domain recommendations has focused on improving prediction accuracy with several severe limitations. In this article, we extend our previous work on clustering-based matrix factorization in single domains into cross domains. In addition, we utilize recent results on unobserved ratings. Our new method can more effectively utilize data from auxiliary domains to achieve better recommendations, especially for cold-start users. For example, our method improves the recall to 21\% on average for cold-start users, whereas previous methods result in only 15\% recall in the cross-domain Amazon dataset. We also observe almost the same improvements in the Epinions dataset. 
Considering that it is often difficult to make even a small improvement in recommendations, for cold-start users in particular, our result is quite significant.", acknowledgement = ack-nhfb, articleno = "33", fjournal = "ACM Transactions on Knowledge Discovery from Data (TKDD)", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J1054", } @Article{Bonchi:2015:CCC, author = "Francesco Bonchi and Aristides Gionis and Francesco Gullo and Charalampos E. Tsourakakis and Antti Ukkonen", title = "Chromatic Correlation Clustering", journal = j-TKDD, volume = "9", number = "4", pages = "34:1--34:??", month = jun, year = "2015", CODEN = "????", DOI = "http://dx.doi.org/10.1145/2728170", ISSN = "1556-4681 (print), 1556-472X (electronic)", ISSN-L = "1556-4681", bibdate = "Wed Jun 3 06:21:22 MDT 2015", bibsource = "http://www.acm.org/pubs/contents/journals/tkdd/; http://www.math.utah.edu/pub/tex/bib/tkdd.bib", abstract = "We study a novel clustering problem in which the pairwise relations between objects are categorical. This problem can be viewed as clustering the vertices of a graph whose edges are of different types (colors). We introduce an objective function that ensures the edges within each cluster have, as much as possible, the same color. We show that the problem is NP-hard and propose a randomized algorithm with approximation guarantee proportional to the maximum degree of the input graph. The algorithm iteratively picks a random edge as a pivot, builds a cluster around it, and removes the cluster from the graph. Although being fast, easy to implement, and parameter-free, this algorithm tends to produce a relatively large number of clusters. To overcome this issue we introduce a variant algorithm, which modifies how the pivot is chosen and how the cluster is built around the pivot. 
Finally, to address the case where a fixed number of output clusters is required, we devise a third algorithm that directly optimizes the objective function based on the alternating-minimization paradigm. We also extend our objective function to handle cases where object's relations are described by multiple labels. We modify our randomized approximation algorithm to optimize such an extended objective function and show that its approximation guarantee remains proportional to the maximum degree of the graph. We test our algorithms on synthetic and real data from the domains of social media, protein-interaction networks, and bibliometrics. Results reveal that our algorithms outperform a baseline algorithm both in the task of reconstructing a ground-truth clustering and in terms of objective-function value.", acknowledgement = ack-nhfb, articleno = "34", fjournal = "ACM Transactions on Knowledge Discovery from Data (TKDD)", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J1054", } @Article{Wang:2015:LSC, author = "Hua Wang and Feiping Nie and Heng Huang", title = "Large-Scale Cross-Language {Web} Page Classification via Dual Knowledge Transfer Using Fast Nonnegative Matrix Trifactorization", journal = j-TKDD, volume = "10", number = "1", pages = "1:1--1:??", month = jul, year = "2015", CODEN = "????", DOI = "http://dx.doi.org/10.1145/2710021", ISSN = "1556-4681 (print), 1556-472X (electronic)", ISSN-L = "1556-4681", bibdate = "Tue Jul 28 17:19:31 MDT 2015", bibsource = "http://www.acm.org/pubs/contents/journals/tkdd/; http://www.math.utah.edu/pub/tex/bib/tkdd.bib", abstract = "With the rapid growth of modern technologies, Internet has reached almost every corner of the world. As a result, it becomes more and more important to manage and mine information contained in Web pages in different languages. Traditional supervised learning methods usually require a large amount of training data to obtain accurate and robust classification models. 
However, labeled Web pages did not increase as fast as the growth of Internet. The lack of sufficient training Web pages in many languages, especially for those in uncommonly used languages, makes it a challenge for traditional classification algorithms to achieve satisfactory performance. To address this, we observe that Web pages for a same topic from different languages usually share some common semantic patterns, though in different representation forms. In addition, we also observe that the associations between word clusters and Web page classes are another type of reliable carriers to transfer knowledge across languages. With these recognitions, in this article we propose a novel joint nonnegative matrix trifactorization (NMTF) based Dual Knowledge Transfer (DKT) approach for cross-language Web page classification. Our approach transfers knowledge from the auxiliary language, in which abundant labeled Web pages are available, to the target languages, in which we want to classify Web pages, through two different paths: word cluster approximation and the associations between word clusters and Web page classes. With the reinforcement between these two different knowledge transfer paths, our approach can achieve better classification accuracy. In order to deal with the large-scale real world data, we further develop the proposed DKT approach by constraining the factor matrices of NMTF to be cluster indicator matrices. Due to the nature of cluster indicator matrices, we can decouple the proposed optimization objective and the resulted subproblems are of much smaller sizes involving much less matrix multiplications, which make our new approach much more computationally efficient. We evaluate the proposed approach in extensive experiments using a real world cross-language Web page data set. 
Promising results have demonstrated the effectiveness of our approach that are consistent with our theoretical analyses.", acknowledgement = ack-nhfb, articleno = "1", fjournal = "ACM Transactions on Knowledge Discovery from Data (TKDD)", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J1054", } @Article{Zhou:2015:SIB, author = "Yang Zhou and Ling Liu", title = "Social Influence Based Clustering and Optimization over Heterogeneous Information Networks", journal = j-TKDD, volume = "10", number = "1", pages = "2:1--2:??", month = jul, year = "2015", CODEN = "????", DOI = "http://dx.doi.org/10.1145/2717314", ISSN = "1556-4681 (print), 1556-472X (electronic)", ISSN-L = "1556-4681", bibdate = "Tue Jul 28 17:19:31 MDT 2015", bibsource = "http://www.acm.org/pubs/contents/journals/tkdd/; http://www.math.utah.edu/pub/tex/bib/tkdd.bib", abstract = "Social influence analysis has shown great potential for strategic marketing decision. It is well known that people influence one another based on both their social connections and the social activities that they have engaged in the past. In this article, we develop an innovative and high-performance social influence based graph clustering framework with four unique features. First, we explicitly distinguish social connection based influence (self-influence) and social activity based influence (co-influence). We compute the self-influence similarity between two members based on their social connections within a single collaboration network, and compute the co-influence similarity by taking into account not only the set of activities that people participate but also the semantic association between these activities. Second, we define the concept of influence-based similarity by introducing a unified influence-based similarity matrix that employs an iterative weight update method to integrate self-influence and co-influence similarities. 
Third, we design a dynamic learning algorithm, called SI-Cluster, for social influence based graph clustering. It iteratively partitions a large social collaboration network into K clusters based on both the social network itself and the multiple associated activity information networks, each representing a category of activities that people have engaged. To make the SI-Cluster algorithm converge fast, we transform sophisticated nonlinear fractional programming problem with respect to multiple weights into a straightforward nonlinear parametric programming problem of single variable. Finally, we develop an optimization technique of diagonalizable-matrix approximation to speed up the computation of self-influence similarity and co-influence similarities. Our SI-Cluster-Opt significantly improves the efficiency of SI-Cluster on large graphs while maintaining high quality of clustering results. Extensive experimental evaluation on three real-world graphs shows that, compared to existing representative graph clustering algorithms, our SI-Cluster-Opt approach not only achieves a very good balance between self-influence and co-influence similarities but also scales extremely well for clustering large graphs in terms of time complexity while meeting the guarantee of high density, low entropy and low Davies--Bouldin Index.", acknowledgement = ack-nhfb, articleno = "2", fjournal = "ACM Transactions on Knowledge Discovery from Data (TKDD)", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J1054", } @Article{Papalexakis:2015:PSP, author = "Evangelos E. Papalexakis and Christos Faloutsos and Nicholas D. 
Sidiropoulos", title = "{ParCube}: Sparse Parallelizable {CANDECOMP--PARAFAC} Tensor Decomposition", journal = j-TKDD, volume = "10", number = "1", pages = "3:1--3:??", month = jul, year = "2015", CODEN = "????", DOI = "http://dx.doi.org/10.1145/2729980", ISSN = "1556-4681 (print), 1556-472X (electronic)", ISSN-L = "1556-4681", bibdate = "Tue Jul 28 17:19:31 MDT 2015", bibsource = "http://www.acm.org/pubs/contents/journals/tkdd/; http://www.math.utah.edu/pub/tex/bib/tkdd.bib", abstract = "How can we efficiently decompose a tensor into sparse factors, when the data do not fit in memory? Tensor decompositions have gained a steadily increasing popularity in data-mining applications; however, the current state-of-art decomposition algorithms operate on main memory and do not scale to truly large datasets. In this work, we propose ParCube, a new and highly parallelizable method for speeding up tensor decompositions that is well suited to produce sparse approximations. Experiments with even moderately large data indicate over 90\% sparser outputs and 14 times faster execution, with approximation error close to the current state of the art irrespective of computation and memory requirements. We provide theoretical guarantees for the algorithm's correctness and we experimentally validate our claims through extensive experiments, including four different real world datasets (Enron, Lbnl, Facebook and Nell), demonstrating its effectiveness for data-mining practitioners. In particular, we are the first to analyze the very large Nell dataset using a sparse tensor decomposition, demonstrating that ParCube enables us to handle effectively and efficiently very large datasets. 
Finally, we make our highly scalable parallel implementation publicly available, enabling reproducibility of our work.", acknowledgement = ack-nhfb, articleno = "3", fjournal = "ACM Transactions on Knowledge Discovery from Data (TKDD)", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J1054", } @Article{Ahmed:2015:AMC, author = "Rezwan Ahmed and George Karypis", title = "Algorithms for Mining the Coevolving Relational Motifs in Dynamic Networks", journal = j-TKDD, volume = "10", number = "1", pages = "4:1--4:??", month = jul, year = "2015", CODEN = "????", DOI = "http://dx.doi.org/10.1145/2733380", ISSN = "1556-4681 (print), 1556-472X (electronic)", ISSN-L = "1556-4681", bibdate = "Tue Jul 28 17:19:31 MDT 2015", bibsource = "http://www.acm.org/pubs/contents/journals/tkdd/; http://www.math.utah.edu/pub/tex/bib/tkdd.bib", abstract = "Computational methods and tools that can efficiently and effectively analyze the temporal changes in dynamic complex relational networks enable us to gain significant insights regarding the entity relations and their evolution. This article introduces a new class of dynamic graph patterns, referred to as coevolving relational motifs (CRMs), which are designed to identify recurring sets of entities whose relations change in a consistent way over time. CRMs can provide evidence to the existence of, possibly unknown, coordination mechanisms by identifying the relational motifs that evolve in a similar and highly conserved fashion. We developed an algorithm to efficiently analyze the frequent relational changes between the entities of the dynamic networks and capture all frequent coevolutions as CRMs. Our algorithm follows a depth-first exploration of the frequent CRM lattice and incorporates canonical labeling for redundancy elimination. Experimental results based on multiple real world dynamic networks show that the method is able to efficiently identify CRMs. 
In addition, a qualitative analysis of the results shows that the discovered patterns can be used as features to characterize the dynamic network.", acknowledgement = ack-nhfb, articleno = "4", fjournal = "ACM Transactions on Knowledge Discovery from Data (TKDD)", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J1054", } @Article{Campello:2015:HDE, author = "Ricardo J. G. B. Campello and Davoud Moulavi and Arthur Zimek and J{\"o}rg Sander", title = "Hierarchical Density Estimates for Data Clustering, Visualization, and Outlier Detection", journal = j-TKDD, volume = "10", number = "1", pages = "5:1--5:??", month = jul, year = "2015", CODEN = "????", DOI = "http://dx.doi.org/10.1145/2733381", ISSN = "1556-4681 (print), 1556-472X (electronic)", ISSN-L = "1556-4681", bibdate = "Tue Jul 28 17:19:31 MDT 2015", bibsource = "http://www.acm.org/pubs/contents/journals/tkdd/; http://www.math.utah.edu/pub/tex/bib/tkdd.bib", abstract = "An integrated framework for density-based cluster analysis, outlier detection, and data visualization is introduced in this article. The main module consists of an algorithm to compute hierarchical estimates of the level sets of a density, following Hartigan's classic model of density-contour clusters and trees. Such an algorithm generalizes and improves existing density-based clustering techniques with respect to different aspects. It provides as a result a complete clustering hierarchy composed of all possible density-based clusters following the nonparametric model adopted, for an infinite range of density thresholds. The resulting hierarchy can be easily processed so as to provide multiple ways for data visualization and exploration. 
It can also be further postprocessed so that: (i) a normalized score of ``outlierness'' can be assigned to each data object, which unifies both the global and local perspectives of outliers into a single definition; and (ii) a ``flat'' (i.e., nonhierarchical) clustering solution composed of clusters extracted from local cuts through the cluster tree (possibly corresponding to different density thresholds) can be obtained, either in an unsupervised or in a semisupervised way. In the unsupervised scenario, the algorithm corresponding to this postprocessing module provides a global, optimal solution to the formal problem of maximizing the overall stability of the extracted clusters. If partially labeled objects or instance-level constraints are provided by the user, the algorithm can solve the problem by considering both constraints violations/satisfactions and cluster stability criteria. An asymptotic complexity analysis, both in terms of running time and memory space, is described. Experiments are reported that involve a variety of synthetic and real datasets, including comparisons with state-of-the-art, density-based clustering and (global and local) outlier detection methods.", acknowledgement = ack-nhfb, articleno = "5", fjournal = "ACM Transactions on Knowledge Discovery from Data (TKDD)", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J1054", }

%%% TKDD volume 10, number 1 (July 2015): articles 6--10 follow.

@Article{Berardi:2015:UTR,
  author =       "Giacomo Berardi and Andrea Esuli and Fabrizio Sebastiani",
  title =        "Utility-Theoretic Ranking for Semiautomated Text
                 Classification",
  journal =      j-TKDD,
  volume =       "10",
  number =       "1",
  pages =        "6:1--6:??",
  month =        jul,
  year =         "2015",
  CODEN =        "????",
  DOI =          "http://dx.doi.org/10.1145/2742548",
  ISSN =         "1556-4681 (print), 1556-472X (electronic)",
  ISSN-L =       "1556-4681",
  bibdate =      "Tue Jul 28 17:19:31 MDT 2015",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tkdd/;
                 http://www.math.utah.edu/pub/tex/bib/tkdd.bib",
  abstract =     "Semiautomated Text Classification (SATC) may be defined
                 as the task of ranking a set D of automatically
                 labelled textual documents in such a way that, if a
                 human annotator validates (i.e., inspects and corrects
                 where appropriate) the documents in a top-ranked
                 portion of D with the goal of increasing the overall
                 labelling accuracy of D, the expected increase is
                 maximized. An obvious SATC strategy is to rank D so
                 that the documents that the classifier has labelled
                 with the lowest confidence are top ranked. In this
                 work, we show that this strategy is suboptimal. We
                 develop new utility-theoretic ranking methods based on
                 the notion of validation gain, defined as the
                 improvement in classification effectiveness that would
                 derive by validating a given automatically labelled
                 document. We also propose a new effectiveness measure
                 for SATC-oriented ranking methods, based on the
                 expected reduction in classification error brought
                 about by partially validating a list generated by a
                 given ranking method. We report the results of
                 experiments showing that, with respect to the baseline
                 method mentioned earlier, and according to the proposed
                 measure, our utility-theoretic ranking methods can
                 achieve substantially higher expected reductions in
                 classification error.",
  acknowledgement = ack-nhfb,
  articleno =    "6",
  fjournal =     "ACM Transactions on Knowledge Discovery from Data
                 (TKDD)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1054",
}

%%% NOTE(review): in the abstract below, ``cannot only lead to'' was
%%% corrected to ``can not only lead to'' (it pairs with ``but also
%%% enable''); confirm against the publisher's abstract if verbatim
%%% fidelity is required.

@Article{Yu:2015:DIP,
  author =       "Zhiwen Yu and Zhu Wang and Huilei He and Jilei Tian and
                 Xinjiang Lu and Bin Guo",
  title =        "Discovering Information Propagation Patterns in
                 Microblogging Services",
  journal =      j-TKDD,
  volume =       "10",
  number =       "1",
  pages =        "7:1--7:??",
  month =        jul,
  year =         "2015",
  CODEN =        "????",
  DOI =          "http://dx.doi.org/10.1145/2742801",
  ISSN =         "1556-4681 (print), 1556-472X (electronic)",
  ISSN-L =       "1556-4681",
  bibdate =      "Tue Jul 28 17:19:31 MDT 2015",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tkdd/;
                 http://www.math.utah.edu/pub/tex/bib/tkdd.bib",
  abstract =     "During the last decade, microblog has become an
                 important social networking service with billions of
                 users all over the world, acting as a novel and
                 efficient platform for the creation and dissemination
                 of real-time information. Modeling and revealing the
                 information propagation patterns in microblogging
                 services can not only lead to more accurate
                 understanding of user behaviors and provide insights
                 into the underlying sociology, but also enable useful
                 applications such as trending prediction,
                 recommendation and filtering, spam detection and viral
                 marketing. In this article, we aim to reveal the
                 information propagation patterns in Sina Weibo, the
                 biggest microblogging service in China. First, the
                 cascade of each message is represented as a tree based
                 on its retweeting process. Afterwards, we divide the
                 information propagation pattern into two levels, that
                 is, the macro level and the micro level. On one hand,
                 the macro propagation patterns refer to general
                 propagation modes that are extracted by grouping
                 propagation trees based on hierarchical clustering. On
                 the other hand, the micro propagation patterns are
                 frequent information flow patterns that are discovered
                 using tree-based mining techniques. Experimental
                 results show that several interesting patterns are
                 extracted, such as popular message propagation,
                 artificial propagation, and typical information flows
                 between different types of users.",
  acknowledgement = ack-nhfb,
  articleno =    "7",
  fjournal =     "ACM Transactions on Knowledge Discovery from Data
                 (TKDD)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1054",
}

@Article{Zhang:2015:SMB,
  author =       "Xianchao Zhang and Xiaotong Zhang and Han Liu",
  title =        "Smart Multitask {Bregman} Clustering and Multitask
                 Kernel Clustering",
  journal =      j-TKDD,
  volume =       "10",
  number =       "1",
  pages =        "8:1--8:??",
  month =        jul,
  year =         "2015",
  CODEN =        "????",
  DOI =          "http://dx.doi.org/10.1145/2747879",
  ISSN =         "1556-4681 (print), 1556-472X (electronic)",
  ISSN-L =       "1556-4681",
  bibdate =      "Tue Jul 28 17:19:31 MDT 2015",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tkdd/;
                 http://www.math.utah.edu/pub/tex/bib/tkdd.bib",
  abstract =     "Traditional clustering algorithms deal with a single
                 clustering task on a single dataset. However, there are
                 many related tasks in the real world, which motivates
                 multitask clustering. Recently some multitask
                 clustering algorithms have been proposed, and among
                 them multitask Bregman clustering (MBC) is a very
                 applicable method. MBC alternatively updates clusters
                 and learns relationships between clusters of different
                 tasks, and the two phases boost each other. However,
                 the boosting does not always have positive effects on
                 improving the clustering performance, it may also cause
                 negative effects. Another issue of MBC is that it
                 cannot deal with nonlinear separable data. In this
                 article, we show that in MBC, the process of using
                 cluster relationship to boost the cluster updating
                 phase may cause negative effects, that is, cluster
                 centroids may be skewed under some conditions. We
                 propose a smart multitask Bregman clustering (S-MBC)
                 algorithm which can identify the negative effects of
                 the boosting and avoid the negative effects if they
                 occur. We then propose a multitask kernel clustering
                 (MKC) framework for nonlinear separable data by using a
                 similar framework like MBC in the kernel space. We also
                 propose a specific optimization method, which is quite
                 different from that of MBC, to implement the MKC
                 framework. Since MKC can also cause negative effects
                 like MBC, we further extend the framework of MKC to a
                 smart multitask kernel clustering (S-MKC) framework in
                 a similar way that S-MBC is extended from MBC. We
                 conduct experiments on 10 real world multitask
                 clustering datasets to evaluate the performance of
                 S-MBC and S-MKC. The results on clustering accuracy
                 show that: (1) compared with the original MBC algorithm
                 MBC, S-MBC and S-MKC perform much better; (2) compared
                 with the convex discriminative multitask relationship
                 clustering (DMTRC) algorithms DMTRC-L and DMTRC-R which
                 also avoid negative transfer, S-MBC and S-MKC perform
                 worse in the (ideal) case in which different tasks have
                 the same cluster number and the empirical label
                 marginal distribution in each task distributes evenly,
                 but better or comparable in other (more general) cases.
                 Moreover, S-MBC and S-MKC can work on the datasets in
                 which different tasks have different number of
                 clusters, violating the assumptions of DMTRC-L and
                 DMTRC-R. The results on efficiency show that S-MBC and
                 S-MKC consume more computational time than MBC and less
                 computational time than DMTRC-L and DMTRC-R. Overall
                 S-MBC and S-MKC are competitive compared with the
                 state-of-the-art multitask clustering algorithms in
                 synthetical terms of accuracy, efficiency and
                 applicability.",
  acknowledgement = ack-nhfb,
  articleno =    "8",
  fjournal =     "ACM Transactions on Knowledge Discovery from Data
                 (TKDD)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1054",
}

%%% NOTE(review): in the abstract below, the transposed phrase ``to more
%%% the complex models'' was corrected to ``to the more complex
%%% models''; confirm against the publisher's abstract if verbatim
%%% fidelity is required.

@Article{Wei:2015:MTP,
  author =       "Wei Wei and Kathleen M. Carley",
  title =        "Measuring Temporal Patterns in Dynamic Social
                 Networks",
  journal =      j-TKDD,
  volume =       "10",
  number =       "1",
  pages =        "9:1--9:??",
  month =        jul,
  year =         "2015",
  CODEN =        "????",
  DOI =          "http://dx.doi.org/10.1145/2749465",
  ISSN =         "1556-4681 (print), 1556-472X (electronic)",
  ISSN-L =       "1556-4681",
  bibdate =      "Tue Jul 28 17:19:31 MDT 2015",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tkdd/;
                 http://www.math.utah.edu/pub/tex/bib/tkdd.bib",
  abstract =     "Given social networks over time, how can we measure
                 network activities across different timesteps with a
                 limited number of metrics? We propose two classes of
                 dynamic metrics for assessing temporal evolution
                 patterns of agents in terms of persistency and
                 emergence. For each class of dynamic metrics, we
                 implement it using three different temporal aggregation
                 models ranging from the most commonly used Average
                 Aggregation Model to the more complex models such as
                 the Exponential Aggregation Model. We argue that the
                 problem of measuring temporal patterns can be
                 formulated using Recency and Primacy effect, which is a
                 concept used to characterize human cognitive processes.
                 Experimental results show that the way metrics model
                 Recency--Primacy effect is closely related to their
                 abilities to measure temporal patterns. Furthermore,
                 our results indicate that future network agent
                 activities can be predicted based on history
                 information using dynamic metrics. By conducting
                 multiple experiments, we are also able to find an
                 optimal length of history information that is most
                 relevant to future activities. This optimal length is
                 highly consistent within a dataset and can be used as
                 an intrinsic metric to evaluate a dynamic social
                 network.",
  acknowledgement = ack-nhfb,
  articleno =    "9",
  fjournal =     "ACM Transactions on Knowledge Discovery from Data
                 (TKDD)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1054",
}

@Article{Liu:2015:RAT,
  author =       "Siyuan Liu and Qiang Qu and Shuhui Wang",
  title =        "Rationality Analytics from Trajectories",
  journal =      j-TKDD,
  volume =       "10",
  number =       "1",
  pages =        "10:1--10:??",
  month =        jul,
  year =         "2015",
  CODEN =        "????",
  DOI =          "http://dx.doi.org/10.1145/2735634",
  ISSN =         "1556-4681 (print), 1556-472X (electronic)",
  ISSN-L =       "1556-4681",
  bibdate =      "Tue Jul 28 17:19:31 MDT 2015",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tkdd/;
                 http://www.math.utah.edu/pub/tex/bib/tkdd.bib",
  abstract =     "The availability of trajectories tracking the
                 geographical locations of people as a function of time
                 offers an opportunity to study human behaviors. In this
                 article, we study rationality from the perspective of
                 user decision on visiting a point of interest (POI)
                 which is represented as a trajectory. However, the
                 analysis of rationality is challenged by a number of
                 issues, for example, how to model a trajectory in terms
                 of complex user decision processes? and how to detect
                 hidden factors that have significant impact on the
                 rational decision making? In this study, we propose
                 Rationality Analysis Model (RAM) to analyze rationality
                 from trajectories in terms of a set of impact factors.
                 In order to automatically identify hidden factors, we
                 propose a method, Collective Hidden Factor Retrieval
                 (CHFR), which can also be generalized to parse multiple
                 trajectories at the same time or parse individual
                 trajectories of different time periods. Extensive
                 experimental study is conducted on three large-scale
                 real-life datasets (i.e., taxi trajectories, user
                 shopping trajectories, and visiting trajectories in a
                 theme park). The results show that the proposed methods
                 are efficient, effective, and scalable. We also deploy
                 a system in a large theme park to conduct a field
                 study. Interesting findings and user feedback of the
                 field study are provided to support other applications
                 in user behavior mining and analysis, such as business
                 intelligence and user management for marketing
                 purposes.",
  acknowledgement = ack-nhfb,
  articleno =    "10",
  fjournal =     "ACM Transactions on Knowledge Discovery from Data
                 (TKDD)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1054",
}