%%% -*-BibTeX-*- %%% ==================================================================== %%% BibTeX-file{ %%% author = "Nelson H. F. Beebe", %%% version = "1.24", %%% date = "29 August 2016", %%% time = "07:31:16 MDT", %%% filename = "tkdd.bib", %%% address = "University of Utah %%% Department of Mathematics, 110 LCB %%% 155 S 1400 E RM 233 %%% Salt Lake City, UT 84112-0090 %%% USA", %%% telephone = "+1 801 581 5254", %%% FAX = "+1 801 581 4148", %%% URL = "http://www.math.utah.edu/~beebe", %%% checksum = "43461 12590 70691 672777", %%% email = "beebe at math.utah.edu, beebe at acm.org, %%% beebe at computer.org (Internet)", %%% codetable = "ISO/ASCII", %%% keywords = "ACM Transactions on Knowledge Discovery from %%% Data (TKDD); bibliography; TKDD", %%% license = "public domain", %%% supported = "yes", %%% docstring = "This is a COMPLETE BibTeX bibliography for %%% ACM Transactions on Knowledge Discovery from %%% Data (TKDD) (CODEN ????, ISSN 1556-4681), %%% covering all journal issues from 2007 -- %%% date. %%% %%% At version 1.24, the COMPLETE journal %%% coverage looked like this: %%% %%% 2007 ( 14) 2011 ( 11) 2015 ( 41) %%% 2008 ( 18) 2012 ( 26) 2016 ( 40) %%% 2009 ( 25) 2013 ( 20) %%% 2010 ( 26) 2014 ( 37) %%% %%% Article: 258 %%% %%% Total entries: 258 %%% %%% The journal Web page can be found at: %%% %%% http://www.acm.org/pubs/tkdd.html %%% %%% The journal table of contents page is at: %%% %%% http://www.acm.org/tkdd/ %%% http://portal.acm.org/browse_dl.cfm?idx=J1054 %%% %%% Qualified subscribers can retrieve the full %%% text of recent articles in PDF form. %%% %%% The initial draft was extracted from the ACM %%% Web pages. %%% %%% ACM copyrights explicitly permit abstracting %%% with credit, so article abstracts, keywords, %%% and subject classifications have been %%% included in this bibliography wherever %%% available. Article reviews have been %%% omitted, until their copyright status has %%% been clarified. 
%%% %%% bibsource keys in the bibliography entries %%% below indicate the entry originally came %%% from the computer science bibliography %%% archive, even though it has likely since %%% been corrected and updated. %%% %%% URL keys in the bibliography point to %%% World Wide Web locations of additional %%% information about the entry. %%% %%% BibTeX citation tags are uniformly chosen %%% as name:year:abbrev, where name is the %%% family name of the first author or editor, %%% year is a 4-digit number, and abbrev is a %%% 3-letter condensation of important title %%% words. Citation tags were automatically %%% generated by software developed for the %%% BibNet Project. %%% %%% In this bibliography, entries are sorted in %%% publication order, using ``bibsort -byvolume.'' %%% %%% The checksum field above contains a CRC-16 %%% checksum as the first value, followed by the %%% equivalent of the standard UNIX wc (word %%% count) utility output of lines, words, and %%% characters. This is produced by Robert %%% Solovay's checksum utility." %%% } %%% ====================================================================

@Preamble{"\input bibnames.sty" # "\def \TM {${}^{\sc TM}$}" }

%%% ==================================================================== %%% Acknowledgement abbreviations:

@String{ack-nhfb= "Nelson H. F. Beebe, University of Utah, Department of Mathematics, 110 LCB, 155 S 1400 E RM 233, Salt Lake City, UT 84112-0090, USA, Tel: +1 801 581 5254, FAX: +1 801 581 4148, e-mail: \path|beebe@math.utah.edu|, \path|beebe@acm.org|, \path|beebe@computer.org| (Internet), URL: \path|http://www.math.utah.edu/~beebe/|"}

%%% ==================================================================== %%% Journal abbreviations:

@String{j-TKDD= "ACM Transactions on Knowledge Discovery from Data (TKDD)"}

%%% ==================================================================== %%% Bibliography entries:

@Article{Han:2007:I, author = "Jiawei Han", title = "Introduction", journal = j-TKDD, volume = "1", number = "1", pages = "1:1--1:??", month = mar, year = "2007", CODEN = "????", DOI = "http://doi.acm.org/10.1145/1217299.1217300", ISSN = "1556-4681 (print), 1556-472X (electronic)", bibdate = "Fri Apr 24 17:58:36 MDT 2009", bibsource = "http://www.acm.org/pubs/contents/journals/tkdd/; http://www.math.utah.edu/pub/tex/bib/tkdd.bib", acknowledgement = ack-nhfb, articleno = "1", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J1054", } @Article{Leskovec:2007:GED, author = "Jure Leskovec and Jon Kleinberg and Christos Faloutsos", title = "Graph evolution: {Densification} and shrinking diameters", journal = j-TKDD, volume = "1", number = "1", pages = "2:1--2:??", month = mar, year = "2007", CODEN = "????", DOI = "http://doi.acm.org/10.1145/1217299.1217301", ISSN = "1556-4681 (print), 1556-472X (electronic)", bibdate = "Fri Apr 24 17:58:36 MDT 2009", bibsource = "http://www.acm.org/pubs/contents/journals/tkdd/; http://www.math.utah.edu/pub/tex/bib/tkdd.bib", abstract = "How do real graphs evolve over time? What are normal growth patterns in social, technological, and information networks? Many studies have discovered patterns in {\em static graphs}, identifying properties in a single snapshot of a large network or in a very small number of snapshots; these include heavy tails for in- and out-degree distributions, communities, small-world phenomena, and others. However, given the lack of information about network evolution over long periods, it has been hard to convert these findings into statements about trends over time.\par Here we study a wide range of real graphs, and we observe some surprising phenomena. First, most of these graphs densify over time with the number of edges growing superlinearly in the number of nodes. 
Second, the average distance between nodes often shrinks over time in contrast to the conventional wisdom that such distance parameters should increase slowly as a function of the number of nodes (like $O(\log n)$ or $O(\log(\log n))$).\par Existing graph generation models do not exhibit these types of behavior even at a qualitative level. We provide a new graph generator, based on a forest fire spreading process that has a simple, intuitive justification, requires very few parameters (like the flammability of nodes), and produces graphs exhibiting the full range of properties observed both in prior work and in the present study.\par We also notice that the forest fire model exhibits a sharp transition between sparse graphs and graphs that are densifying. Graphs with decreasing distance between the nodes are generated around this transition point.\par Last, we analyze the connection between the temporal evolution of the degree distribution and densification of a graph. We find that the two are fundamentally related. 
We also observe that real networks exhibit this type of relation between densification and the degree distribution.", acknowledgement = ack-nhfb, articleno = "2", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J1054", keywords = "Densification power laws; graph generators; graph mining; heavy-tailed distributions; small-world phenomena", } @Article{Machanavajjhala:2007:DPB, author = "Ashwin Machanavajjhala and Daniel Kifer and Johannes Gehrke and Muthuramakrishnan Venkitasubramaniam", title = "{{$L$}}-diversity: {Privacy} beyond $k$-anonymity", journal = j-TKDD, volume = "1", number = "1", pages = "3:1--3:??", month = mar, year = "2007", CODEN = "????", DOI = "http://doi.acm.org/10.1145/1217299.1217302", ISSN = "1556-4681 (print), 1556-472X (electronic)", bibdate = "Fri Apr 24 17:58:36 MDT 2009", bibsource = "http://www.acm.org/pubs/contents/journals/tkdd/; http://www.math.utah.edu/pub/tex/bib/tkdd.bib", abstract = "Publishing data about individuals without revealing sensitive information about them is an important problem. In recent years, a new definition of privacy called $k$-anonymity has gained popularity. In a $k$-anonymized dataset, each record is indistinguishable from at least $k - 1$ other records with respect to certain identifying attributes.\par In this article, we show using two simple attacks that a $k$-anonymized dataset has some subtle but severe privacy problems. First, an attacker can discover the values of sensitive attributes when there is little diversity in those sensitive attributes. This is a known problem. Second, attackers often have background knowledge, and we show that $k$-anonymity does not guarantee privacy against attackers using background knowledge. We give a detailed analysis of these two attacks, and we propose a novel and powerful privacy criterion called $\ell$-diversity that can defend against such attacks. 
In addition to building a formal foundation for $\ell$-diversity, we show in an experimental evaluation that $\ell$-diversity is practical and can be implemented efficiently.", acknowledgement = ack-nhfb, articleno = "3", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J1054", keywords = "$\ell$-diversity; Data privacy; $k$-anonymity; privacy-preserving data publishing", } @Article{Gionis:2007:CA, author = "Aristides Gionis and Heikki Mannila and Panayiotis Tsaparas", title = "Clustering aggregation", journal = j-TKDD, volume = "1", number = "1", pages = "4:1--4:??", month = mar, year = "2007", CODEN = "????", DOI = "http://doi.acm.org/10.1145/1217299.1217303", ISSN = "1556-4681 (print), 1556-472X (electronic)", bibdate = "Fri Apr 24 17:58:36 MDT 2009", bibsource = "http://www.acm.org/pubs/contents/journals/tkdd/; http://www.math.utah.edu/pub/tex/bib/tkdd.bib", abstract = "We consider the following problem: given a set of clusterings, find a single clustering that agrees as much as possible with the input clusterings. This problem, {\em clustering aggregation}, appears naturally in various contexts. For example, clustering categorical data is an instance of the clustering aggregation problem; each categorical attribute can be viewed as a clustering of the input rows where rows are grouped together if they take the same value on that attribute. Clustering aggregation can also be used as a metaclustering method to improve the robustness of clustering by combining the output of multiple algorithms. Furthermore, the problem formulation does not require a priori information about the number of clusters; it is naturally determined by the optimization function.\par In this article, we give a formal statement of the clustering aggregation problem, and we propose a number of algorithms. Our algorithms make use of the connection between clustering aggregation and the problem of {\em correlation clustering}. 
Although the problems we consider are NP-hard, for several of our methods, we provide theoretical guarantees on the quality of the solutions. Our work provides the best deterministic approximation algorithm for the variation of the correlation clustering problem we consider. We also show how sampling can be used to scale the algorithms for large datasets. We give an extensive empirical evaluation demonstrating the usefulness of the problem and of the solutions.", acknowledgement = ack-nhfb, articleno = "4", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J1054", keywords = "clustering aggregation; clustering categorical data; correlation clustering; Data clustering", } @Article{Bhattacharya:2007:CER, author = "Indrajit Bhattacharya and Lise Getoor", title = "Collective entity resolution in relational data", journal = j-TKDD, volume = "1", number = "1", pages = "5:1--5:??", month = mar, year = "2007", CODEN = "????", DOI = "http://doi.acm.org/10.1145/1217299.1217304", ISSN = "1556-4681 (print), 1556-472X (electronic)", bibdate = "Fri Apr 24 17:58:36 MDT 2009", bibsource = "http://www.acm.org/pubs/contents/journals/tkdd/; http://www.math.utah.edu/pub/tex/bib/tkdd.bib", abstract = "Many databases contain uncertain and imprecise references to real-world entities. The absence of identifiers for the underlying entities often results in a database which contains multiple references to the same entity. This can lead not only to data redundancy, but also inaccuracies in query processing and knowledge extraction. These problems can be alleviated through the use of {\em entity resolution}. Entity resolution involves discovering the underlying entities and mapping each database reference to these entities. Traditionally, entities are resolved using pairwise similarity over the attributes of references. However, there is often additional relational information in the data. Specifically, references to different entities may cooccur. 
In these cases, collective entity resolution, in which entities for cooccurring references are determined jointly rather than independently, can improve entity resolution accuracy. We propose a novel relational clustering algorithm that uses both attribute and relational information for determining the underlying domain entities, and we give an efficient implementation. We investigate the impact that different relational similarity measures have on entity resolution quality. We evaluate our collective entity resolution algorithm on multiple real-world databases. We show that it improves entity resolution performance over both attribute-based baselines and over algorithms that consider relational information but do not resolve entities collectively. In addition, we perform detailed experiments on synthetically generated data to identify data characteristics that favor collective relational resolution over purely attribute-based algorithms.", acknowledgement = ack-nhfb, articleno = "5", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J1054", keywords = "data cleaning; Entity resolution; graph clustering; record linkage", } @Article{Loh:2007:EEL, author = "Wei-Yin Loh and Chien-Wei Chen and Wei Zheng", title = "Extrapolation errors in linear model trees", journal = j-TKDD, volume = "1", number = "2", pages = "6:1--6:??", month = aug, year = "2007", CODEN = "????", DOI = "http://doi.acm.org/10.1145/1267066.1267067", ISSN = "1556-4681 (print), 1556-472X (electronic)", bibdate = "Fri Apr 24 17:58:48 MDT 2009", bibsource = "http://www.acm.org/pubs/contents/journals/tkdd/; http://www.math.utah.edu/pub/tex/bib/tkdd.bib", abstract = "Prediction errors from a linear model tend to be larger when extrapolation is involved, particularly when the model is wrong. This article considers the problem of extrapolation and interpolation errors when a linear model tree is used for prediction. 
It proposes several ways to curtail the size of the errors, and uses a large collection of real datasets to demonstrate that the solutions are effective in reducing the average mean squared prediction error. The article also provides a proof that, if a linear model is correct, the proposed solutions have no undesirable effects as the training sample size tends to infinity.", acknowledgement = ack-nhfb, articleno = "6", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J1054", keywords = "Decision tree; prediction; regression; statistics", } @Article{Zhang:2007:MPP, author = "Minghua Zhang and Ben Kao and David W. Cheung and Kevin Y. Yip", title = "Mining periodic patterns with gap requirement from sequences", journal = j-TKDD, volume = "1", number = "2", pages = "7:1--7:??", month = aug, year = "2007", CODEN = "????", DOI = "http://doi.acm.org/10.1145/1267066.1267068", ISSN = "1556-4681 (print), 1556-472X (electronic)", bibdate = "Fri Apr 24 17:58:48 MDT 2009", bibsource = "http://www.acm.org/pubs/contents/journals/tkdd/; http://www.math.utah.edu/pub/tex/bib/tkdd.bib", abstract = "We study a problem of mining frequently occurring periodic patterns with a gap requirement from sequences. Given a character sequence $S$ of length $L$ and a pattern $P$ of length $l$, we consider $P$ a frequently occurring pattern in $S$ if the probability of {\em observing\/} $P$ given a randomly picked length-$l$ subsequence of $S$ exceeds a certain threshold. In many applications, particularly those related to bioinformatics, interesting patterns are {\em periodic\/} with a {\em gap requirement}. That is to say, the characters in $P$ should match subsequences of $S$ in such a way that the matching characters in $S$ are separated by gaps of more or less the same size. We show the complexity of the mining problem and discuss why traditional mining algorithms are computationally infeasible. We propose practical algorithms for solving the problem and study their characteristics. 
We also present a case study in which we apply our algorithms on some DNA sequences. We discuss some interesting patterns obtained from the case study.", acknowledgement = ack-nhfb, articleno = "7", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J1054", keywords = "gap requirement; periodic pattern; Sequence mining", } @Article{Huang:2007:TTE, author = "Jen-Wei Huang and Bi-Ru Dai and Ming-Syan Chen", title = "{Twain}: {Two-end} association miner with precise frequent exhibition periods", journal = j-TKDD, volume = "1", number = "2", pages = "8:1--8:??", month = aug, year = "2007", CODEN = "????", DOI = "http://doi.acm.org/10.1145/1267066.1267069", ISSN = "1556-4681 (print), 1556-472X (electronic)", bibdate = "Fri Apr 24 17:58:48 MDT 2009", bibsource = "http://www.acm.org/pubs/contents/journals/tkdd/; http://www.math.utah.edu/pub/tex/bib/tkdd.bib", abstract = "We investigate the general model of mining associations in a temporal database, where the exhibition periods of items are allowed to be different from one to another. The database is divided into partitions according to the time granularity imposed. Such temporal association rules allow us to observe short-term but interesting patterns that are absent when the whole range of the database is evaluated altogether. Prior work may omit some temporal association rules and thus have limited practicability. To remedy this and to give more precise frequent exhibition periods of frequent temporal itemsets, we devise an efficient algorithm {\em Twain\/} (standing for {\em TWo end AssocIation miNer\/}). {\em Twain\/} not only generates frequent patterns with more precise frequent exhibition periods, but also discovers more interesting frequent patterns. {\em Twain\/} employs Start time and End time of each item to provide precise frequent exhibition period while progressively handling itemsets from one partition to another. 
Along with one scan of the database, {\em Twain\/} can generate frequent 2-itemsets directly according to the cumulative filtering threshold. Then, {\em Twain\/} adopts the scan reduction technique to generate all frequent $k$-itemsets ($k > 2$) from the generated frequent 2-itemsets. Theoretical properties of {\em Twain\/} are derived as well in this article. The experimental results show that {\em Twain\/} outperforms the prior works in the quality of frequent patterns, execution time, I/O cost, CPU overhead and scalability.", acknowledgement = ack-nhfb, articleno = "8", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J1054", keywords = "Association; temporal", } @Article{Bayardop:2007:ISI, author = "Roberto Bayardop and Kristin P. Bennett and Gautam Das and Dimitrios Gunopulos and Johannes Gunopulos", title = "Introduction to special issue {ACM SIGKDD 2006}", journal = j-TKDD, volume = "1", number = "3", pages = "9:1--9:??", month = dec, year = "2007", CODEN = "????", DOI = "http://doi.acm.org/10.1145/1297332.1297333", ISSN = "1556-4681 (print), 1556-472X (electronic)", bibdate = "Fri Apr 24 17:58:56 MDT 2009", bibsource = "http://www.acm.org/pubs/contents/journals/tkdd/; http://www.math.utah.edu/pub/tex/bib/tkdd.bib", acknowledgement = ack-nhfb, articleno = "9", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J1054", } @Article{Bohm:2007:RPF, author = "Christian B{\"o}hm and Christos Faloutsos and Jia-Yu Pan and Claudia Plant", title = "{RIC}: {Parameter-free} noise-robust clustering", journal = j-TKDD, volume = "1", number = "3", pages = "10:1--10:??", month = dec, year = "2007", CODEN = "????", DOI = "http://doi.acm.org/10.1145/1297332.1297334", ISSN = "1556-4681 (print), 1556-472X (electronic)", bibdate = "Fri Apr 24 17:58:56 MDT 2009", bibsource = "http://www.acm.org/pubs/contents/journals/tkdd/; http://www.math.utah.edu/pub/tex/bib/tkdd.bib", abstract = "How do we find a {\em natural\/} clustering of a real-world point set which contains an 
unknown number of clusters with different shapes, and which may be contaminated by noise? As most clustering algorithms were designed with certain assumptions (Gaussianity), they often require the user to give input parameters, and are sensitive to noise. In this article, we propose a robust framework for determining a natural clustering of a given dataset, based on the minimum description length (MDL) principle. The proposed framework, {\em robust information-theoretic clustering (RIC)}, is orthogonal to any known clustering algorithm: Given a preliminary clustering, RIC purifies these clusters from noise, and adjusts the clusterings such that it simultaneously determines the most natural amount and shape (subspace) of the clusters. Our RIC method can be combined with any clustering technique ranging from K-means and K-medoids to advanced methods such as spectral clustering. In fact, RIC is even able to purify and improve an initial coarse clustering, even if we start with very simple methods. In an extension, we propose a fully automatic stand-alone clustering method and efficiency improvements. RIC scales well with the dataset size. 
Extensive experiments on synthetic and real-world datasets validate the proposed RIC framework.", acknowledgement = ack-nhfb, articleno = "10", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J1054", keywords = "Clustering; data summarization; noise robustness; parameter-free data mining", } @Article{Mei:2007:SAF, author = "Qiaozhu Mei and Dong Xin and Hong Cheng and Jiawei Han and Chengxiang Zhai", title = "Semantic annotation of frequent patterns", journal = j-TKDD, volume = "1", number = "3", pages = "11:1--11:??", month = dec, year = "2007", CODEN = "????", DOI = "http://doi.acm.org/10.1145/1297332.1297335", ISSN = "1556-4681 (print), 1556-472X (electronic)", bibdate = "Fri Apr 24 17:58:56 MDT 2009", bibsource = "http://www.acm.org/pubs/contents/journals/tkdd/; http://www.math.utah.edu/pub/tex/bib/tkdd.bib", abstract = "Using frequent patterns to analyze data has been one of the fundamental approaches in many data mining applications. Research in frequent pattern mining has so far mostly focused on developing efficient algorithms to discover various kinds of frequent patterns, but little attention has been paid to the important next step --- interpreting the discovered frequent patterns. Although the compression and summarization of frequent patterns has been studied in some recent work, the proposed techniques there can only annotate a frequent pattern with nonsemantical information (e.g., support), which provides only limited help for a user to understand the patterns.\par In this article, we study the novel problem of generating semantic annotations for frequent patterns. The goal is to discover the hidden meanings of a frequent pattern by annotating it with in-depth, concise, and structured information. We propose a general approach to generate such an annotation for a frequent pattern by constructing its context model, selecting informative context indicators, and extracting representative transactions and semantically similar patterns. 
This general approach can well incorporate the user's prior knowledge, and has potentially many applications, such as generating a dictionary-like description for a pattern, finding synonym patterns, discovering semantic relations, and summarizing semantic classes of a set of frequent patterns. Experiments on different datasets show that our approach is effective in generating semantic pattern annotations.", acknowledgement = ack-nhfb, articleno = "11", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J1054", keywords = "Frequent pattern; pattern annotation; pattern context; pattern semantic analysis", } @Article{Koren:2007:MEP, author = "Yehuda Koren and Stephen C. North and Chris Volinsky", title = "Measuring and extracting proximity graphs in networks", journal = j-TKDD, volume = "1", number = "3", pages = "12:1--12:??", month = dec, year = "2007", CODEN = "????", DOI = "http://doi.acm.org/10.1145/1297332.1297336", ISSN = "1556-4681 (print), 1556-472X (electronic)", bibdate = "Fri Apr 24 17:58:56 MDT 2009", bibsource = "http://www.acm.org/pubs/contents/journals/tkdd/; http://www.math.utah.edu/pub/tex/bib/tkdd.bib", abstract = "Measuring distance or some other form of proximity between objects is a standard data mining tool. Connection subgraphs were recently proposed as a way to demonstrate proximity between nodes in networks. We propose a new way of measuring and extracting proximity in networks called ``cycle-free effective conductance'' (CFEC). Importantly, the measured proximity is accompanied with a {\em proximity subgraph\/} which allows assessing and understanding measured values. Our proximity calculation can handle more than two endpoints, directed edges, is statistically well behaved, and produces an effectiveness score for the computed subgraphs. We provide an efficient algorithm to measure and extract proximity. 
Also, we report experimental results and show examples for four large network datasets: a telecommunications calling graph, the IMDB actors graph, an academic coauthorship network, and a movie recommendation system.", acknowledgement = ack-nhfb, articleno = "12", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J1054", keywords = "Connection subgraph; cycle-free escape probability; escape probability; graph mining; proximity; proximity subgraph; random walk", } @Article{Ihler:2007:LDE, author = "Alexander Ihler and Jon Hutchins and Padhraic Smyth", title = "Learning to detect events with {Markov}-modulated {Poisson} processes", journal = j-TKDD, volume = "1", number = "3", pages = "13:1--13:??", month = dec, year = "2007", CODEN = "????", DOI = "http://doi.acm.org/10.1145/1297332.1297337", ISSN = "1556-4681 (print), 1556-472X (electronic)", bibdate = "Fri Apr 24 17:58:56 MDT 2009", bibsource = "http://www.acm.org/pubs/contents/journals/tkdd/; http://www.math.utah.edu/pub/tex/bib/tkdd.bib", abstract = "Time-series of count data occur in many different contexts, including Internet navigation logs, freeway traffic monitoring, and security logs associated with buildings. In this article we describe a framework for detecting anomalous events in such data using an unsupervised learning approach. Normal periodic behavior is modeled via a time-varying Poisson process model, which in turn is modulated by a hidden Markov process that accounts for bursty events. We outline a Bayesian framework for learning the parameters of this model from count time-series. Two large real-world datasets of time-series counts are used as testbeds to validate the approach, consisting of freeway traffic data and logs of people entering and exiting a building. We show that the proposed model is significantly more accurate at detecting known events than a more traditional threshold-based technique. 
We also describe how the model can be used to investigate different degrees of periodicity in the data, including systematic day-of-week and time-of-day effects, and to make inferences about different aspects of events such as number of vehicles or people involved. The results indicate that the Markov-modulated Poisson framework provides a robust and accurate framework for adaptively and autonomously learning how to separate unusual bursty events from traces of normal human activity.", acknowledgement = ack-nhfb, articleno = "13", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J1054", keywords = "Event detection; Markov modulated; Poisson", } @Article{Gionis:2007:ADM, author = "Aristides Gionis and Heikki Mannila and Taneli Mielik{\"a}inen and Panayiotis Tsaparas", title = "Assessing data mining results via swap randomization", journal = j-TKDD, volume = "1", number = "3", pages = "14:1--14:??", month = dec, year = "2007", CODEN = "????", DOI = "http://doi.acm.org/10.1145/1297332.1297338", ISSN = "1556-4681 (print), 1556-472X (electronic)", bibdate = "Fri Apr 24 17:58:56 MDT 2009", bibsource = "http://www.acm.org/pubs/contents/journals/tkdd/; http://www.math.utah.edu/pub/tex/bib/tkdd.bib", abstract = "The problem of assessing the significance of data mining results on high-dimensional 0--1 datasets has been studied extensively in the literature. For problems such as mining frequent sets and finding correlations, significance testing can be done by standard statistical tests such as chi-square, or other methods. However, the results of such tests depend only on the specific attributes and not on the dataset as a whole. Moreover, the tests are difficult to apply to sets of patterns or other complex results of data mining algorithms. In this article, we consider a simple randomization technique that deals with this shortcoming. 
The approach consists of producing random datasets that have the same row and column margins as the given dataset, computing the results of interest on the randomized instances and comparing them to the results on the actual data. This randomization technique can be used to assess the results of many different types of data mining algorithms, such as frequent sets, clustering, and spectral analysis. To generate random datasets with given margins, we use variations of a Markov chain approach which is based on a simple swap operation. We give theoretical results on the efficiency of different randomization methods, and apply the swap randomization method to several well-known datasets. Our results indicate that for some datasets the structure discovered by the data mining algorithms is expected, given the row and column margins of the datasets, while for other datasets the discovered structure conveys information that is not captured by the margin counts.", acknowledgement = ack-nhfb, articleno = "14", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J1054", keywords = "0--1 data; randomization tests; Significance testing; swaps", } @Article{Tang:2008:TTA, author = "Lei Tang and Huan Liu and Jianping Zhang and Nitin Agarwal and John J. Salerno", title = "Topic taxonomy adaptation for group profiling", journal = j-TKDD, volume = "1", number = "4", pages = "1:1--1:??", month = jan, year = "2008", CODEN = "????", DOI = "http://doi.acm.org/10.1145/1324172.1324173", ISSN = "1556-4681 (print), 1556-472X (electronic)", bibdate = "Fri Apr 24 17:59:07 MDT 2009", bibsource = "http://www.acm.org/pubs/contents/journals/tkdd/; http://www.math.utah.edu/pub/tex/bib/tkdd.bib", abstract = "A topic taxonomy is an effective representation that describes salient features of virtual groups or online communities. A topic taxonomy consists of topic nodes. 
Each internal node is defined by its vertical path (i.e., ancestor and child nodes) and its horizontal list of attributes (or terms). In a text-dominant environment, a topic taxonomy can be used to flexibly describe a group's interests with varying granularity. However, the stagnant nature of a taxonomy may fail to timely capture the dynamic change of a group's interest. This article addresses the problem of how to adapt a topic taxonomy to the accumulated data that reflects the change of a group's interest to achieve dynamic group profiling. We first discuss the issues related to topic taxonomy. We next formulate taxonomy adaptation as an optimization problem to find the taxonomy that best fits the data. We then present a viable algorithm that can efficiently accomplish taxonomy adaptation. We conduct extensive experiments to evaluate our approach's efficacy for group profiling, compare the approach with some alternatives, and study its performance for dynamic group profiling. While pointing out various applications of taxonomy adaption, we suggest some future work that can take advantage of burgeoning Web 2.0 services for online targeted marketing, counterterrorism in connecting dots, and community tracking.", acknowledgement = ack-nhfb, articleno = "1", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J1054", keywords = "dynamic profiling; group interest; taxonomy adjustment; text hierarchical classification; Topic taxonomy", } @Article{Cormode:2008:FHH, author = "Graham Cormode and Flip Korn and S. 
Muthukrishnan and Divesh Srivastava", title = "Finding hierarchical heavy hitters in streaming data", journal = j-TKDD, volume = "1", number = "4", pages = "2:1--2:??", month = jan, year = "2008", CODEN = "????", DOI = "http://doi.acm.org/10.1145/1324172.1324174", ISSN = "1556-4681 (print), 1556-472X (electronic)", bibdate = "Fri Apr 24 17:59:07 MDT 2009", bibsource = "http://www.acm.org/pubs/contents/journals/tkdd/; http://www.math.utah.edu/pub/tex/bib/tkdd.bib", abstract = "Data items that arrive online as streams typically have attributes which take values from one or more hierarchies (time and geographic location, source and destination IP addresses, etc.). Providing an aggregate view of such data is important for summarization, visualization, and analysis. We develop an aggregate view based on certain organized sets of large-valued regions (``heavy hitters'') corresponding to hierarchically discounted frequency counts. We formally define the notion of {\em hierarchical heavy hitters\/} (HHHs). We first consider computing (approximate) HHHs over a data stream drawn from a single hierarchical attribute. We formalize the problem and give deterministic algorithms to find them in a single pass over the input.\par In order to analyze a wider range of realistic data streams (e.g., from IP traffic-monitoring applications), we generalize this problem to multiple dimensions. Here, the semantics of HHHs are more complex, since a ``child'' node can have multiple ``parent'' nodes. We present online algorithms that find approximate HHHs in one pass, with provable accuracy guarantees. The product of hierarchical dimensions forms a mathematical lattice structure. 
Our algorithms exploit this structure, and so are able to track approximate HHHs using only a small, fixed number of statistics per stored item, regardless of the number of dimensions.\par We show experimentally, using real data, that our proposed algorithms yield outputs which are very similar (virtually identical, in many cases) to offline computations of the exact solutions, whereas straightforward heavy-hitters-based approaches give significantly inferior answer quality. Furthermore, the proposed algorithms result in an order of magnitude savings in data structure size while performing competitively.", acknowledgement = ack-nhfb, articleno = "2", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J1054", keywords = "approximation algorithms; Data mining; network data analysis", } @Article{Somaiya:2008:LCU, author = "Manas Somaiya and Christopher Jermaine and Sanjay Ranka", title = "Learning correlations using the mixture-of-subsets model", journal = j-TKDD, volume = "1", number = "4", pages = "3:1--3:??", month = jan, year = "2008", CODEN = "????", DOI = "http://doi.acm.org/10.1145/1324172.1324175", ISSN = "1556-4681 (print), 1556-472X (electronic)", bibdate = "Fri Apr 24 17:59:07 MDT 2009", bibsource = "http://www.acm.org/pubs/contents/journals/tkdd/; http://www.math.utah.edu/pub/tex/bib/tkdd.bib", abstract = "Using a mixture of random variables to model data is a tried-and-tested method common in data mining, machine learning, and statistics. By using mixture modeling it is often possible to accurately model even complex, multimodal data via very simple components. However, the classical mixture model assumes that a data point is generated by a single component in the model. A lot of datasets can be modeled closer to the underlying reality if we drop this restriction. We propose a probabilistic framework, the {\em mixture-of-subsets (MOS) model}, by making two fundamental changes to the classical mixture model. 
First, we allow a data point to be generated by a set of components, rather than just a single component. Next, we limit the number of data attributes that each component can influence. We also propose an EM framework to learn the MOS model from a dataset, and experimentally evaluate it on real, high-dimensional datasets. Our results show that the MOS model learned from the data represents the underlying nature of the data accurately.", acknowledgement = ack-nhfb, articleno = "3", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J1054", keywords = "EM algorithm; high-dimensional data; Mixture modeling", } @Article{Halkidi:2008:CFB, author = "M. Halkidi and D. Gunopulos and M. Vazirgiannis and N. Kumar and C. Domeniconi", title = "A clustering framework based on subjective and objective validity criteria", journal = j-TKDD, volume = "1", number = "4", pages = "4:1--4:??", month = jan, year = "2008", CODEN = "????", DOI = "http://doi.acm.org/10.1145/1324172.1324176", ISSN = "1556-4681 (print), 1556-472X (electronic)", bibdate = "Fri Apr 24 17:59:07 MDT 2009", bibsource = "http://www.acm.org/pubs/contents/journals/tkdd/; http://www.math.utah.edu/pub/tex/bib/tkdd.bib", abstract = "Clustering, as an unsupervised learning process is a challenging problem, especially in cases of high-dimensional datasets. Clustering result quality can benefit from user constraints and objective validity assessment. In this article, we propose a semisupervised framework for learning the weighted Euclidean subspace, where the best clustering can be achieved. Our approach capitalizes on: (i) user constraints; and (ii) the quality of intermediate clustering results in terms of their structural properties. The proposed framework uses the clustering algorithm and the validity measure as its parameters. We develop and discuss algorithms for learning and tuning the weights of contributing dimensions and defining the ``best'' clustering obtained by satisfying user constraints. 
Experimental results on benchmark datasets demonstrate the superiority of the proposed approach in terms of improved clustering accuracy.", acknowledgement = ack-nhfb, articleno = "4", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J1054", keywords = "cluster validity; data mining; Semisupervised learning; similarity measure learning; space learning", } @Article{Zaki:2008:ISI, author = "Mohammed J. Zaki and George Karypis and Jiong Yang and Wei Wang", title = "Introduction to special issue on bioinformatics", journal = j-TKDD, volume = "2", number = "1", pages = "1:1--1:??", month = mar, year = "2008", CODEN = "????", DOI = "http://doi.acm.org/10.1145/1342320.1342321", ISSN = "1556-4681 (print), 1556-472X (electronic)", bibdate = "Fri Apr 24 17:59:18 MDT 2009", bibsource = "http://www.acm.org/pubs/contents/journals/tkdd/; http://www.math.utah.edu/pub/tex/bib/tkdd.bib", acknowledgement = ack-nhfb, articleno = "1", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J1054", } @Article{Jin:2008:CMM, author = "Ying Jin and T. M. Murali and Naren Ramakrishnan", title = "Compositional mining of multirelational biological datasets", journal = j-TKDD, volume = "2", number = "1", pages = "2:1--2:??", month = mar, year = "2008", CODEN = "????", DOI = "http://doi.acm.org/10.1145/1342320.1342322", ISSN = "1556-4681 (print), 1556-472X (electronic)", bibdate = "Fri Apr 24 17:59:18 MDT 2009", bibsource = "http://www.acm.org/pubs/contents/journals/tkdd/; http://www.math.utah.edu/pub/tex/bib/tkdd.bib", abstract = "High-throughput biological screens are yielding ever-growing streams of information about multiple aspects of cellular activity. As more and more categories of datasets come online, there is a corresponding multitude of ways in which inferences can be chained across them, motivating the need for compositional data mining algorithms. 
In this article, we argue that such compositional data mining can be effectively realized by functionally cascading redescription mining and biclustering algorithms as primitives. Both these primitives mirror shifts of vocabulary that can be composed in arbitrary ways to create rich chains of inferences. Given a relational database and its schema, we show how the schema can be automatically compiled into a compositional data mining program, and how different domains in the schema can be related through logical sequences of biclustering and redescription invocations. This feature allows us to rapidly prototype new data mining applications, yielding greater understanding of scientific datasets. We describe two applications of compositional data mining: (i) matching terms across categories of the Gene Ontology and (ii) understanding the molecular mechanisms underlying stress response in human cells.", acknowledgement = ack-nhfb, articleno = "2", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J1054", keywords = "Biclustering; bioinformatics; compositional data mining; inductive logic programming; redescription mining", } @Article{Sahay:2008:DSB, author = "Saurav Sahay and Sougata Mukherjea and Eugene Agichtein and Ernest V. Garcia and Shamkant B. Navathe and Ashwin Ram", title = "Discovering semantic biomedical relations utilizing the {Web}", journal = j-TKDD, volume = "2", number = "1", pages = "3:1--3:??", month = mar, year = "2008", CODEN = "????", DOI = "http://doi.acm.org/10.1145/1342320.1342323", ISSN = "1556-4681 (print), 1556-472X (electronic)", bibdate = "Fri Apr 24 17:59:18 MDT 2009", bibsource = "http://www.acm.org/pubs/contents/journals/tkdd/; http://www.math.utah.edu/pub/tex/bib/tkdd.bib", abstract = "To realize the vision of a Semantic Web for Life Sciences, discovering relations between resources is essential. It is very difficult to automatically extract relations from Web pages expressed in natural language formats. 
On the other hand, because of the explosive growth of information, it is difficult to manually extract the relations. In this paper we present techniques to automatically discover relations between biomedical resources from the Web. For this purpose we retrieve relevant information from Web Search engines and Pubmed database using various lexico-syntactic patterns as queries over SOAP web services. The patterns are initially handcrafted but can be progressively learnt. The extracted relations can be used to construct and augment ontologies and knowledge bases. Experiments are presented for general biomedical relation discovery and domain specific search to show the usefulness of our technique.", acknowledgement = ack-nhfb, articleno = "3", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J1054", keywords = "Ontology construction; relation identification", } @Article{Ye:2008:DSA, author = "Jieping Ye and Jianhui Chen and Ravi Janardan and Sudhir Kumar", title = "Developmental stage annotation of {Drosophila} gene expression pattern images via an entire solution path for {LDA}", journal = j-TKDD, volume = "2", number = "1", pages = "4:1--4:??", month = mar, year = "2008", CODEN = "????", DOI = "http://doi.acm.org/10.1145/1342320.1342324", ISSN = "1556-4681 (print), 1556-472X (electronic)", bibdate = "Fri Apr 24 17:59:18 MDT 2009", bibsource = "http://www.acm.org/pubs/contents/journals/tkdd/; http://www.math.utah.edu/pub/tex/bib/string-matching.bib; http://www.math.utah.edu/pub/tex/bib/tkdd.bib", abstract = "Gene expression in a developing embryo occurs in particular cells (spatial patterns) in a time-specific manner (temporal patterns), which leads to the differentiation of cell fates. Images of a {\em Drosophila melanogaster\/} embryo at a given developmental stage, showing a particular gene expression pattern revealed by a gene-specific probe, can be compared for spatial overlaps. 
The comparison is fundamentally important to formulating and testing gene interaction hypotheses. Expression pattern comparison is most biologically meaningful when images from a similar time point (developmental stage) are compared. In this paper, we present LdaPath, a novel formulation of Linear Discriminant Analysis (LDA) for automatic developmental stage range classification. It employs multivariate linear regression with the {$ L_1 $}-norm penalty controlled by a regularization parameter for feature extraction and visualization. LdaPath computes an entire solution path for all values of regularization parameter with essentially the same computational cost as fitting one LDA model. Thus, it facilitates efficient model selection. It is based on the equivalence relationship between LDA and the least squares method for multiclass classifications. This equivalence relationship is established under a mild condition, which we show empirically to hold for many high-dimensional datasets, such as expression pattern images. Our experiments on a collection of 2705 expression pattern images show the effectiveness of the proposed algorithm. Results also show that the LDA model resulting from LdaPath is sparse, and irrelevant features may be removed. 
Thus, LdaPath provides a general framework for simultaneous feature selection and feature extraction.", acknowledgement = ack-nhfb, articleno = "4", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J1054", keywords = "dimensionality reduction; Gene expression pattern image; linear discriminant analysis; linear regression", } @Article{Lu:2008:ADA, author = "Yijuan Lu and Qi Tian and Jennifer Neary and Feng Liu and Yufeng Wang", title = "Adaptive discriminant analysis for microarray-based classification", journal = j-TKDD, volume = "2", number = "1", pages = "5:1--5:??", month = mar, year = "2008", CODEN = "????", DOI = "http://doi.acm.org/10.1145/1342320.1342325", ISSN = "1556-4681 (print), 1556-472X (electronic)", bibdate = "Fri Apr 24 17:59:18 MDT 2009", bibsource = "http://www.acm.org/pubs/contents/journals/tkdd/; http://www.math.utah.edu/pub/tex/bib/tkdd.bib", abstract = "Microarray technology has generated enormous amounts of high-dimensional gene expression data, providing a unique platform for exploring gene regulatory networks. However, the curse of dimensionality plagues effort to analyze these high throughput data. Linear Discriminant Analysis (LDA) and Biased Discriminant Analysis (BDA) are two popular techniques for dimension reduction, which pay attention to different roles of the positive and negative samples in finding discriminating subspace. However, the drawbacks of these two methods are obvious: LDA has limited efficiency in classifying sample data from subclasses with different distributions, and BDA does not account for the underlying distribution of negative samples.\par In this paper, we propose a novel dimension reduction technique for microarray analysis: Adaptive Discriminant Analysis (ADA), which effectively exploits favorable attributes of both BDA and LDA and avoids their unfavorable ones. ADA can find a good discriminative subspace with adaptation to different sample distributions. 
It not only alleviates the problem of high dimensionality, but also enhances the classification performance in the subspace with na{\"\i}ve Bayes classifier. To learn the best model fitting the real scenario, boosted Adaptive Discriminant Analysis is further proposed. Extensive experiments on the yeast cell cycle regulation data set, and the expression data of the red blood cell cycle in malaria parasite {\em Plasmodium falciparum\/} demonstrate the superior performance of ADA and boosted ADA. We also present some putative genes of specific functional classes predicted by boosted ADA. Their potential functionality is confirmed by independent predictions based on Gene Ontology, demonstrating that ADA and boosted ADA are effective dimension reduction methods for microarray-based classification.", acknowledgement = ack-nhfb, articleno = "5", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J1054", keywords = "ADA; BDA; boosted ADA; dimension reduction; LDA; microarray", } @Article{Hashimoto:2008:NEP, author = "Kosuke Hashimoto and Kiyoko Flora Aoki-Kinoshita and Nobuhisa Ueda and Minoru Kanehisa and Hiroshi Mamitsuka", title = "A new efficient probabilistic model for mining labeled ordered trees applied to glycobiology", journal = j-TKDD, volume = "2", number = "1", pages = "6:1--6:??", month = mar, year = "2008", CODEN = "????", DOI = "http://doi.acm.org/10.1145/1342320.1342326", ISSN = "1556-4681 (print), 1556-472X (electronic)", bibdate = "Fri Apr 24 17:59:18 MDT 2009", bibsource = "http://www.acm.org/pubs/contents/journals/tkdd/; http://www.math.utah.edu/pub/tex/bib/tkdd.bib", abstract = "Mining frequent patterns from large datasets is an important issue in data mining. Recently, complex and unstructured (or semi-structured) datasets have appeared as targets for major data mining applications, including text mining, web mining and bioinformatics. Our work focuses on labeled ordered trees, which are typically semi-structured datasets. 
In bioinformatics, carbohydrate sugar chains, or glycans, can be modeled as labeled ordered trees. Glycans are the third major class of biomolecules, having important roles in signaling and recognition. For mining labeled ordered trees, we propose a new probabilistic model and its efficient learning scheme which significantly improves the time and space complexity of an existing probabilistic model for labeled ordered trees. We evaluated the performance of the proposed model, comparing it with those of other probabilistic models, using synthetic as well as real datasets from glycobiology. Experimental results showed that the proposed model drastically reduced the computation time of the competing model, keeping the predictive power and avoiding overfitting to the training data. Finally, we assessed our results on real data from a variety of biological viewpoints, verifying known facts in glycobiology.", acknowledgement = ack-nhfb, articleno = "6", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J1054", keywords = "Expectation-maximization; labeled ordered trees; maximum likelihood; probabilistic models", } @Article{Ge:2008:JCA, author = "Rong Ge and Martin Ester and Byron J. Gao and Zengjian Hu and Binay Bhattacharya and Boaz Ben-Moshe", title = "Joint cluster analysis of attribute data and relationship data: {The} connected $k$-center problem, algorithms and applications", journal = j-TKDD, volume = "2", number = "2", pages = "7:1--7:??", month = jul, year = "2008", CODEN = "????", DOI = "http://doi.acm.org/10.1145/1376815.1376816", ISSN = "1556-4681 (print), 1556-472X (electronic)", bibdate = "Fri Apr 24 17:59:30 MDT 2009", bibsource = "http://www.acm.org/pubs/contents/journals/tkdd/; http://www.math.utah.edu/pub/tex/bib/tkdd.bib", abstract = "Attribute data and relationship data are two principal types of data, representing the intrinsic and extrinsic properties of entities. 
While attribute data have been the main source of data for cluster analysis, relationship data such as social networks or metabolic networks are becoming increasingly available. It is also common to observe both data types carry complementary information such as in market segmentation and community identification, which calls for a joint cluster analysis of both data types so as to achieve better results. In this article, we introduce the novel Connected $k$-Center ({\em CkC\/}) problem, a clustering model taking into account attribute data as well as relationship data. We analyze the complexity of the problem and prove its NP-hardness. Therefore, we analyze the approximability of the problem and also present a constant factor approximation algorithm. For the special case of the {\em CkC\/} problem where the relationship data form a tree structure, we propose a dynamic programming method giving an optimal solution in polynomial time. We further present NetScan, a heuristic algorithm that is efficient and effective for large real databases. 
Our extensive experimental evaluation on real datasets demonstrates the meaningfulness and accuracy of the NetScan results.", acknowledgement = ack-nhfb, articleno = "7", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J1054", keywords = "approximation algorithms; Attribute data; community identification; document clustering; joint cluster analysis; market segmentation; NP-hardness; relationship data", } @Article{Gupta:2008:BBC, author = "Gunjan Gupta and Joydeep Ghosh", title = "{Bregman} bubble clustering: a robust framework for mining dense clusters", journal = j-TKDD, volume = "2", number = "2", pages = "8:1--8:??", month = jul, year = "2008", CODEN = "????", DOI = "http://doi.acm.org/10.1145/1376815.1376817", ISSN = "1556-4681 (print), 1556-472X (electronic)", bibdate = "Fri Apr 24 17:59:30 MDT 2009", bibsource = "http://www.acm.org/pubs/contents/journals/tkdd/; http://www.math.utah.edu/pub/tex/bib/tkdd.bib", abstract = "In classical clustering, each data point is assigned to at least one cluster. However, in many applications only a small subset of the available data is relevant for the problem and the rest needs to be ignored in order to obtain good clusters. Certain nonparametric density-based clustering methods find the most relevant data as multiple dense regions, but such methods are generally limited to low-dimensional data and do not scale well to large, high-dimensional datasets. Also, they use a specific notion of ``distance'', typically Euclidean or Mahalanobis distance, which further limits their applicability. On the other hand, the recent One Class Information Bottleneck (OC-IB) method is fast and works on a large class of distortion measures known as Bregman Divergences, but can only find a {\em single\/} dense region. This article presents a broad framework for finding $k$ dense clusters while ignoring the rest of the data. It includes a seeding algorithm that can automatically determine a suitable value for {\em k}. 
When $k$ is forced to 1, our method gives rise to an improved version of OC-IB with optimality guarantees. We provide a generative model that yields the proposed iterative algorithm for finding $k$ dense regions as a special case. Our analysis reveals an interesting and novel connection between the problem of finding dense regions and exponential mixture models; a hard model corresponding to $k$ exponential mixtures with a uniform background results in a set of $k$ dense clusters. The proposed method describes a highly scalable algorithm for finding multiple dense regions that works with any Bregman Divergence, thus extending density based clustering to a variety of non-Euclidean problems not addressable by earlier methods. We present empirical results on three artificial, two microarray and one text dataset to show the relevance and effectiveness of our methods.", acknowledgement = ack-nhfb, articleno = "8", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J1054", keywords = "Bregman divergences; Density-based clustering; expectation maximization; exponential family; One Class classification", } @Article{Tan:2008:TMG, author = "Henry Tan and Fedja Hadzic and Tharam S. Dillon and Elizabeth Chang and Ling Feng", title = "Tree model guided candidate generation for mining frequent subtrees from {XML} documents", journal = j-TKDD, volume = "2", number = "2", pages = "9:1--9:??", month = jul, year = "2008", CODEN = "????", DOI = "http://doi.acm.org/10.1145/1376815.1376818", ISSN = "1556-4681 (print), 1556-472X (electronic)", bibdate = "Fri Apr 24 17:59:30 MDT 2009", bibsource = "http://www.acm.org/pubs/contents/journals/tkdd/; http://www.math.utah.edu/pub/tex/bib/tkdd.bib", abstract = "Due to the inherent flexibilities in both structure and semantics, XML association rules mining faces few challenges, such as: a more complicated hierarchical data structure and ordered data context. 
Mining frequent patterns from XML documents can be recast as mining frequent tree structures from a database of XML documents. In this study, we model a database of XML documents as a database of rooted labeled ordered subtrees. In particular, we are mainly concerned with mining frequent induced and embedded ordered subtrees. Our main contributions are as follows. We describe our unique {\em embedding list\/} representation of the tree structure, which enables efficient implementation of our {\em Tree Model Guided\/} ({\em TMG\/}) candidate generation. {\em TMG\/} is an optimal, nonredundant enumeration strategy that enumerates all the valid candidates that conform to the structural aspects of the data. We show through a mathematical model and experiments that {\em TMG\/} has better complexity compared to the commonly used join approach. In this article, we propose two algorithms, MB3-Miner and iMB3-Miner. MB3-Miner mines embedded subtrees. iMB3-Miner mines induced and/or embedded subtrees by using the {\em maximum level of embedding constraint}. 
Our experiments with both synthetic and real datasets against two well-known algorithms for mining induced and embedded subtrees, demonstrate the effectiveness and the efficiency of the proposed techniques.", acknowledgement = ack-nhfb, articleno = "9", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J1054", keywords = "FREQT; TMG; Tree mining; tree model guided; TreeMiner", } @Article{Islam:2008:STS, author = "Aminul Islam and Diana Inkpen", title = "Semantic text similarity using corpus-based word similarity and string similarity", journal = j-TKDD, volume = "2", number = "2", pages = "10:1--10:??", month = jul, year = "2008", CODEN = "????", DOI = "http://doi.acm.org/10.1145/1376815.1376819", ISSN = "1556-4681 (print), 1556-472X (electronic)", bibdate = "Fri Apr 24 17:59:30 MDT 2009", bibsource = "http://www.acm.org/pubs/contents/journals/tkdd/; http://www.math.utah.edu/pub/tex/bib/tkdd.bib", abstract = "We present a method for measuring the semantic similarity of texts using a corpus-based measure of semantic word similarity and a normalized and modified version of the Longest Common Subsequence (LCS) string matching algorithm. Existing methods for computing text similarity have focused mainly on either large documents or individual words. We focus on computing the similarity between two sentences or two short paragraphs. The proposed method can be exploited in a variety of applications involving textual knowledge representation and knowledge discovery. Evaluation results on two different data sets show that our method outperforms several competing methods.", acknowledgement = ack-nhfb, articleno = "10", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J1054", keywords = "corpus-based measures; Semantic similarity of words; similarity of short texts", } @Article{Sun:2008:ITA, author = "Jimeng Sun and Dacheng Tao and Spiros Papadimitriou and Philip S. 
Yu and Christos Faloutsos", title = "Incremental tensor analysis: {Theory} and applications", journal = j-TKDD, volume = "2", number = "3", pages = "11:1--11:??", month = oct, year = "2008", CODEN = "????", DOI = "http://doi.acm.org/10.1145/1409620.1409621", ISSN = "1556-4681 (print), 1556-472X (electronic)", bibdate = "Fri Apr 24 17:59:41 MDT 2009", bibsource = "http://www.acm.org/pubs/contents/journals/tkdd/; http://www.math.utah.edu/pub/tex/bib/tkdd.bib", abstract = "How do we find patterns in author-keyword associations, evolving over time? Or in data cubes (tensors), with product-branch-customer sales information? And more generally, how to summarize high-order data cubes (tensors)? How to incrementally update these patterns over time? Matrix decompositions, like principal component analysis (PCA) and variants, are invaluable tools for mining, dimensionality reduction, feature selection, rule identification in numerous settings like streaming data, text, graphs, social networks, and many more settings. However, they have only two orders (i.e., matrices, like author and keyword in the previous example).\par We propose to envision such higher-order data as tensors, and tap the vast literature on the topic. However, these methods do not necessarily scale up, let alone operate on semi-infinite streams. Thus, we introduce a general framework, incremental tensor analysis (ITA), which efficiently computes a compact summary for high-order and high-dimensional data, and also reveals the hidden correlations. Three variants of ITA are presented: (1) dynamic tensor analysis (DTA); (2) streaming tensor analysis (STA); and (3) window-based tensor analysis (WTA). 
In particular, we explore several fundamental design trade-offs such as space efficiency, computational cost, approximation accuracy, time dependency, and model complexity.\par We implement all our methods and apply them in several real settings, such as network anomaly detection, multiway latent semantic indexing on citation networks, and correlation study on sensor measurements. Our empirical studies show that the proposed methods are fast and accurate and that they find interesting patterns and outliers on the real datasets.", acknowledgement = ack-nhfb, articleno = "11", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J1054", keywords = "multilinear algebra; stream mining; Tensor", } @Article{Mangasarian:2008:PPC, author = "Olvi L. Mangasarian and Edward W. Wild and Glenn M. Fung", title = "Privacy-preserving classification of vertically partitioned data via random kernels", journal = j-TKDD, volume = "2", number = "3", pages = "12:1--12:??", month = oct, year = "2008", CODEN = "????", DOI = "http://doi.acm.org/10.1145/1409620.1409622", ISSN = "1556-4681 (print), 1556-472X (electronic)", bibdate = "Fri Apr 24 17:59:41 MDT 2009", bibsource = "http://www.acm.org/pubs/contents/journals/tkdd/; http://www.math.utah.edu/pub/tex/bib/tkdd.bib", abstract = "We propose a novel privacy-preserving support vector machine (SVM) classifier for a data matrix $A$ whose input feature columns are divided into groups belonging to different entities. Each entity is unwilling to share its group of columns or make it public. Our classifier is based on the concept of a reduced kernel $k(A, B\prime)$, where $B\prime$ is the transpose of a random matrix $B$. The column blocks of $B$ corresponding to the different entities are privately generated by each entity and never made public. 
The proposed linear or nonlinear SVM classifier, which is public but does not reveal any of the privately held data, has accuracy comparable to that of an ordinary SVM classifier that uses the entire set of input features directly.", acknowledgement = ack-nhfb, articleno = "12", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J1054", keywords = "Privacy preserving classification; support vector machines; vertically partitioned data", } @Article{Lakshmanan:2008:DRA, author = "Laks V. S. Lakshmanan and Raymond T. Ng and Ganesh Ramesh", title = "On disclosure risk analysis of anonymized itemsets in the presence of prior knowledge", journal = j-TKDD, volume = "2", number = "3", pages = "13:1--13:??", month = oct, year = "2008", CODEN = "????", DOI = "http://doi.acm.org/10.1145/1409620.1409623", ISSN = "1556-4681 (print), 1556-472X (electronic)", bibdate = "Fri Apr 24 17:59:41 MDT 2009", bibsource = "http://www.acm.org/pubs/contents/journals/tkdd/; http://www.math.utah.edu/pub/tex/bib/tkdd.bib", abstract = "Decision makers of companies often face the dilemma of whether to release data for knowledge discovery, vis-a-vis the risk of disclosing proprietary or sensitive information. Among the various methods employed for ``sanitizing'' the data prior to disclosure, we focus in this article on anonymization, given its widespread use in practice. We do due diligence to the question ``just how safe is the anonymized data?'' We consider both those scenarios when the hacker has no information and, more realistically, when the hacker may have partial information about items in the domain. We conduct our analyses in the context of frequent set mining and address the safety question at two different levels: (i) how likely of being cracked (i.e., re-identified by a hacker), are the identities of individual items and (ii) how likely are sets of items cracked? 
For capturing the prior knowledge of the hacker, we propose a {\em belief function}, which amounts to an educated guess of the frequency of each item. For various classes of belief functions which correspond to different degrees of prior knowledge, we derive formulas for computing the expected number of cracks of single items and for itemsets, the probability of cracking the itemsets. While obtaining, exact values for more general situations is computationally hard, we propose a series of heuristics called the {\em O-estimates}. They are easy to compute and are shown fairly accurate, justified by empirical results on real benchmark datasets. Based on the O-estimates, we propose a recipe for the decision makers to resolve their dilemma. Our recipe operates at two different levels, depending on whether the data owner wants to reason in terms of single items or sets of items (or both). Finally, we present techniques for ascertaining a hacker's knowledge of correlation in terms of co-occurrence of items likely. This information regarding the hacker's knowledge can be incorporated into our framework of disclosure risk analysis and we present experimental results demonstrating how this knowledge affects the heuristic estimates we have developed.", acknowledgement = ack-nhfb, articleno = "13", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J1054", keywords = "anonymization; belief function; bipartite graphs; correlation; Disclosure risk; frequent itemsets; hacker; matching; prior knowledge; sampling", } @Article{Vaidya:2008:PPD, author = "Jaideep Vaidya and Chris Clifton and Murat Kantarcioglu and A. 
Scott Patterson", title = "Privacy-preserving decision trees over vertically partitioned data", journal = j-TKDD, volume = "2", number = "3", pages = "14:1--14:??", month = oct, year = "2008", CODEN = "????", DOI = "http://doi.acm.org/10.1145/1409620.1409624", ISSN = "1556-4681 (print), 1556-472X (electronic)", bibdate = "Fri Apr 24 17:59:41 MDT 2009", bibsource = "http://www.acm.org/pubs/contents/journals/tkdd/; http://www.math.utah.edu/pub/tex/bib/tkdd.bib", abstract = "Privacy and security concerns can prevent sharing of data, derailing data-mining projects. Distributed knowledge discovery, if done correctly, can alleviate this problem. We introduce a generalized privacy-preserving variant of the ID3 algorithm for vertically partitioned data distributed over two or more parties. Along with a proof of security, we discuss what would be necessary to make the protocols completely secure. We also provide experimental results, giving a first demonstration of the practical complexity of secure multiparty computation-based data mining.", acknowledgement = ack-nhfb, articleno = "14", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J1054", keywords = "Decision tree classification; privacy", } @Article{Chuang:2009:FPS, author = "Kun-Ta Chuang and Hung-Leng Chen and Ming-Syan Chen", title = "Feature-preserved sampling over streaming data", journal = j-TKDD, volume = "2", number = "4", pages = "15:1--15:??", month = jan, year = "2009", CODEN = "????", DOI = "http://doi.acm.org/10.1145/1460797.1460798", ISSN = "1556-4681 (print), 1556-472X (electronic)", bibdate = "Fri Apr 24 17:59:51 MDT 2009", bibsource = "http://www.acm.org/pubs/contents/journals/tkdd/; http://www.math.utah.edu/pub/tex/bib/tkdd.bib", abstract = "In this article, we explore a novel sampling model, called {\em feature preserved sampling\/} ({\em FPS\/}) that sequentially generates a high-quality sample over sliding windows. 
The sampling quality we consider refers to the degree of consistency between the sample proportion and the population proportion of each attribute value in a window. Due to the time-variant nature of real-world datasets, users are more likely to be interested in the most recent data. However, previous works have not been able to generate a high-quality sample over sliding windows that precisely preserves up-to-date population characteristics. Motivated by this shortcoming, we have developed the {\em FPS\/} algorithm, which has several advantages: (1) it sequentially generates a sample from a time-variant data source over sliding windows; (2) the execution time of {\em FPS\/} is linear with respect to the database size; (3) the {\em relative\/} proportional differences between the sample proportions and population proportions of most distinct attribute values are guaranteed to be below a specified error threshold, $\epsilon$ , while the {\em relative\/} proportion differences of the remaining attribute values are as close to $\epsilon$ as possible, which ensures that the generated sample is of high quality; (4) the sample rate is close to the user specified rate so that a high quality sampling result can be obtained without increasing the sample size; (5) by a thorough analytical and empirical study, we prove that {\em FPS\/} has acceptable space overheads, especially when the attribute values have Zipfian distributions, and {\em FPS\/} can also excellently preserve the population proportion of multivariate features in the sample; and (6) {\em FPS\/} can be applied to infinite streams and finite datasets equally, and the generated samples can be used for various applications. Our experiments on both real and synthetic data validate that {\em FPS\/} can effectively obtain a high quality sample of the desired size. 
In addition, while using the sample generated by {\em FPS\/} in various mining applications, a significant improvement in efficiency can be achieved without compromising the model's precision.", acknowledgement = ack-nhfb, articleno = "15", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J1054", keywords = "sampling; Streaming mining", } @Article{Jiang:2009:MFC, author = "Daxin Jiang and Jian Pei", title = "Mining frequent cross-graph quasi-cliques", journal = j-TKDD, volume = "2", number = "4", pages = "16:1--16:??", month = jan, year = "2009", CODEN = "????", DOI = "http://doi.acm.org/10.1145/1460797.1460799", ISSN = "1556-4681 (print), 1556-472X (electronic)", bibdate = "Fri Apr 24 17:59:51 MDT 2009", bibsource = "http://www.acm.org/pubs/contents/journals/tkdd/; http://www.math.utah.edu/pub/tex/bib/tkdd.bib", abstract = "Joint mining of multiple datasets can often discover interesting, novel, and reliable patterns which cannot be obtained solely from any single source. For example, in bioinformatics, jointly mining multiple gene expression datasets obtained by different labs or during various biological processes may overcome the heavy noise in the data. Moreover, by joint mining of gene expression data and protein-protein interaction data, we may discover clusters of genes which show coherent expression patterns and also produce interacting proteins. Such clusters may be potential pathways.\par In this article, we investigate a novel data mining problem, {\em mining frequent cross-graph quasi-cliques}, which is generalized from several interesting applications in bioinformatics, cross-market customer segmentation, social network analysis, and Web mining. In a graph, a set of vertices $S$ is a $\gamma$-quasi-clique $(0 < \gamma \leq 1)$ if each vertex $v$ in $S$ directly connects to at least $\gamma \cdot (|S| - 1)$ other vertices in $S$. 
Given a set of graphs $G_1, \ldots{}, G_n$ and parameter ${\rm min\_sup} (0 < {\rm min\_sup} \leq 1)$, a set of vertices $S$ is a frequent cross-graph quasi-clique if $S$ is a $\gamma$-quasi-clique in at least ${\rm min\_sup} \cdot n$ graphs, and there does not exist a proper superset of $S$ having the property.\par We build a general model, show why the complete set of frequent cross-graph quasi-cliques cannot be found by previous data mining methods, and study the complexity of the problem. While the problem is difficult, we develop practical algorithms which exploit several interesting and effective techniques and heuristics to efficaciously mine frequent cross-graph quasi-cliques. A systematic performance study is reported on both synthetic and real data sets. We demonstrate some interesting and meaningful frequent cross-graph quasi-cliques in bioinformatics. The experimental results also show that our algorithms are efficient and scalable.", acknowledgement = ack-nhfb, articleno = "16", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J1054", keywords = "bioinformatics; clique; Graph mining; joint mining", } @Article{Domeniconi:2009:WCE, author = "Carlotta Domeniconi and Muna Al-Razgan", title = "Weighted cluster ensembles: {Methods} and analysis", journal = j-TKDD, volume = "2", number = "4", pages = "17:1--17:??", month = jan, year = "2009", CODEN = "????", DOI = "http://doi.acm.org/10.1145/1460797.1460800", ISSN = "1556-4681 (print), 1556-472X (electronic)", bibdate = "Fri Apr 24 17:59:51 MDT 2009", bibsource = "http://www.acm.org/pubs/contents/journals/tkdd/; http://www.math.utah.edu/pub/tex/bib/tkdd.bib", abstract = "Cluster ensembles offer a solution to challenges inherent to clustering arising from its ill-posed nature. 
Cluster ensembles can provide robust and stable solutions by leveraging the consensus across multiple clustering results, while averaging out emergent spurious structures that arise due to the various biases to which each participating algorithm is tuned. In this article, we address the problem of combining multiple {\em weighted clusters\/} that belong to different subspaces of the input space. We leverage the diversity of the input clusterings in order to generate a consensus partition that is superior to the participating ones. Since we are dealing with weighted clusters, our consensus functions make use of the weight vectors associated with the clusters. We demonstrate the effectiveness of our techniques by running experiments with several real datasets, including high-dimensional text data. Furthermore, we investigate in depth the issue of diversity and accuracy for our ensemble methods. Our analysis and experimental results show that the proposed techniques are capable of producing a partition that is as good as or better than the best individual clustering.", acknowledgement = ack-nhfb, articleno = "17", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J1054", keywords = "accuracy and diversity measures; Cluster ensembles; consensus functions; data mining; subspace clustering; text data", } @Article{Zhang:2009:DGA, author = "Zhenjie Zhang and Laks V. S. Lakshmanan and Anthony K. H. Tung", title = "On domination game analysis for microeconomic data mining", journal = j-TKDD, volume = "2", number = "4", pages = "18:1--18:??", month = jan, year = "2009", CODEN = "????", DOI = "http://doi.acm.org/10.1145/1460797.1460801", ISSN = "1556-4681 (print), 1556-472X (electronic)", bibdate = "Fri Apr 24 17:59:51 MDT 2009", bibsource = "http://www.acm.org/pubs/contents/journals/tkdd/; http://www.math.utah.edu/pub/tex/bib/tkdd.bib", abstract = "Game theory is a powerful tool for analyzing the competitions among manufacturers in a market. 
In this article, we present a study on combining game theory and data mining by introducing the concept of domination game analysis. We present a multidimensional market model, where every dimension represents one attribute of a commodity. Every product or customer is represented by a point in the multidimensional space, and a product is said to ``dominate'' a customer if all of its attributes can satisfy the requirements of the customer. The expected market share of a product is measured by the expected number of the buyers in the customers, all of which are equally likely to buy any product dominating him. A Nash equilibrium is a configuration of the products achieving stable expected market shares for all products. We prove that Nash equilibrium in such a model can be computed in polynomial time if every manufacturer tries to modify its product in a round robin manner. To further improve the efficiency of the computation, we also design two algorithms for the manufacturers to efficiently find their best response to other products in the market.", acknowledgement = ack-nhfb, articleno = "18", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J1054", keywords = "data mining; Domination game; game theory", } @Article{Kriegel:2009:CHD, author = "Hans-Peter Kriegel and Peer Kr{\"o}ger and Arthur Zimek", title = "Clustering high-dimensional data: {A} survey on subspace clustering, pattern-based clustering, and correlation clustering", journal = j-TKDD, volume = "3", number = "1", pages = "1:1--1:??", month = mar, year = "2009", CODEN = "????", DOI = "http://doi.acm.org/10.1145/1497577.1497578", ISSN = "1556-4681 (print), 1556-472X (electronic)", bibdate = "Fri Apr 24 18:00:01 MDT 2009", bibsource = "http://www.acm.org/pubs/contents/journals/tkdd/; http://www.math.utah.edu/pub/tex/bib/tkdd.bib", abstract = "As a prolific research area in data mining, subspace clustering and related problems induced a vast quantity of proposed solutions. 
However, many publications compare a new proposition --- if at all --- with one or two competitors, or even with a so-called ``na{\"\i}ve'' ad hoc solution, but fail to clarify the exact problem definition. As a consequence, even if two solutions are thoroughly compared experimentally, it will often remain unclear whether both solutions tackle the same problem or, if they do, whether they agree in certain tacit assumptions and how such assumptions may influence the outcome of an algorithm. In this survey, we try to clarify: (i) the different problem definitions related to subspace clustering in general; (ii) the specific difficulties encountered in this field of research; (iii) the varying assumptions, heuristics, and intuitions forming the basis of different approaches; and (iv) how several prominent solutions tackle different problems.", acknowledgement = ack-nhfb, articleno = "1", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J1054", keywords = "clustering; high-dimensional data; Survey", } @Article{Dhurandhar:2009:SAM, author = "Amit Dhurandhar and Alin Dobra", title = "Semi-analytical method for analyzing models and model selection measures based on moment analysis", journal = j-TKDD, volume = "3", number = "1", pages = "2:1--2:??", month = mar, year = "2009", CODEN = "????", DOI = "http://doi.acm.org/10.1145/1497577.1497579", ISSN = "1556-4681 (print), 1556-472X (electronic)", bibdate = "Fri Apr 24 18:00:01 MDT 2009", bibsource = "http://www.acm.org/pubs/contents/journals/tkdd/; http://www.math.utah.edu/pub/tex/bib/tkdd.bib", abstract = "In this article we propose a moment-based method for studying models and model selection measures. By focusing on the probabilistic space of classifiers induced by the classification algorithm rather than on that of datasets, we obtain efficient characterizations for computing the moments, which is followed by visualization of the resulting formulae that are too complicated for direct interpretation. 
By assuming the data to be drawn independently and identically distributed from the underlying probability distribution, and by going over the space of all possible datasets, we establish general relationships between the generalization error, hold-out-set error, cross-validation error, and leave-one-out error. We later exemplify the method and the results by studying the behavior of the errors for the naive Bayes classifier.", acknowledgement = ack-nhfb, articleno = "2", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J1054", keywords = "classification; generalization error; Model selection", } @Article{Cerf:2009:CPM, author = "Lo{\"\i}c Cerf and J{\'e}r{\'e}my Besson and C{\'e}line Robardet and Jean-Fran{\c{c}}ois Boulicaut", title = "Closed patterns meet $n$-ary relations", journal = j-TKDD, volume = "3", number = "1", pages = "3:1--3:??", month = mar, year = "2009", CODEN = "????", DOI = "http://doi.acm.org/10.1145/1497577.1497580", ISSN = "1556-4681 (print), 1556-472X (electronic)", bibdate = "Fri Apr 24 18:00:01 MDT 2009", bibsource = "http://www.acm.org/pubs/contents/journals/tkdd/; http://www.math.utah.edu/pub/tex/bib/tkdd.bib", abstract = "Set pattern discovery from binary relations has been extensively studied during the last decade. In particular, many complete and efficient algorithms for frequent closed set mining are now available. Generalizing such a task to $n$-ary relations ($n \geq 2$) appears as a timely challenge. It may be important for many applications, for example, when adding the time dimension to the popular {\em objects\/} $\times$ {\em features\/} binary case. The generality of the task (no assumption being made on the relation arity or on the size of its attribute domains) makes it computationally challenging. We introduce an algorithm called Data-Peeler. From an $n$-ary relation, it extracts all closed $n$-sets satisfying given piecewise (anti) monotonic constraints. 
This new class of constraints generalizes both monotonic and antimonotonic constraints. Considering the special case of ternary relations, Data-Peeler outperforms the state-of-the-art algorithms CubeMiner and Trias by orders of magnitude. These good performances must be granted to a new clever enumeration strategy allowing to efficiently enforce the closeness property. The relevance of the extracted closed $n$-sets is assessed on real-life 3- and 4-ary relations. Beyond natural 3- or 4-ary relations, expanding a relation with an additional attribute can help in enforcing rather abstract constraints such as the robustness with respect to binarization. Furthermore, a collection of closed $n$-sets is shown to be an excellent starting point to compute a tiling of the dataset.", acknowledgement = ack-nhfb, articleno = "3", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J1054", keywords = "$n$-ary relations; Closed patterns; constraint properties; constraint-based mining; tiling", } @Article{Angiulli:2009:DEA, author = "Fabrizio Angiulli and Fabio Fassetti", title = "{DOLPHIN}: {An} efficient algorithm for mining distance-based outliers in very large datasets", journal = j-TKDD, volume = "3", number = "1", pages = "4:1--4:??", month = mar, year = "2009", CODEN = "????", DOI = "http://doi.acm.org/10.1145/1497577.1497581", ISSN = "1556-4681 (print), 1556-472X (electronic)", bibdate = "Fri Apr 24 18:00:01 MDT 2009", bibsource = "http://www.acm.org/pubs/contents/journals/tkdd/; http://www.math.utah.edu/pub/tex/bib/tkdd.bib", abstract = "In this work a novel distance-based outlier detection algorithm, named DOLPHIN, working on disk-resident datasets and whose I/O cost corresponds to the cost of sequentially reading the input dataset file twice, is presented.\par It is both theoretically and empirically shown that the main memory usage of DOLPHIN amounts to a small fraction of the dataset and that DOLPHIN has linear time performance with respect to the dataset size. 
DOLPHIN gains efficiency by naturally merging together in a unified schema three strategies, namely the selection policy of objects to be maintained in main memory, usage of pruning rules, and similarity search techniques. Importantly, similarity search is accomplished by the algorithm without the need of preliminarily indexing the whole dataset, as other methods do.\par The algorithm is simple to implement and it can be used with any type of data, belonging to either metric or nonmetric spaces. Moreover, a modification to the basic method allows DOLPHIN to deal with the scenario in which the available buffer of main memory is smaller than its standard requirements. DOLPHIN has been compared with state-of-the-art distance-based outlier detection algorithms, showing that it is much more efficient.", acknowledgement = ack-nhfb, articleno = "4", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J1054", keywords = "Data mining; distance-based outliers; outlier detection", } @Article{Chen:2009:BAS, author = "Bee-Chung Chen and Raghu Ramakrishnan and Jude W. Shavlik and Pradeep Tamma", title = "Bellwether analysis: {Searching} for cost-effective query-defined predictors in large databases", journal = j-TKDD, volume = "3", number = "1", pages = "5:1--5:??", month = mar, year = "2009", CODEN = "????", DOI = "http://doi.acm.org/10.1145/1497577.1497582", ISSN = "1556-4681 (print), 1556-472X (electronic)", bibdate = "Fri Apr 24 18:00:01 MDT 2009", bibsource = "http://www.acm.org/pubs/contents/journals/tkdd/; http://www.math.utah.edu/pub/tex/bib/tkdd.bib", abstract = "How to mine massive datasets is a challenging problem with great potential value. Motivated by this challenge, much effort has concentrated on developing scalable versions of machine learning algorithms. 
However, the cost of mining large datasets is not just computational; preparing the datasets into the ``right form'' so that learning algorithms can be applied is usually costly, due to the human labor that is typically required and a large number of choices in data preparation, which include selecting different subsets of data and aggregating data at different granularities. We make the key observation that, for a number of practically motivated problems, these choices can be defined using database queries and analyzed in an automatic and systematic manner. Specifically, we propose a new class of data-mining problem, called {\em bellwether analysis}, in which the goal is to find a few query-defined predictors (e.g., first week sales of Peoria, IL of an item) that can be used to accurately predict the result of a target query (e.g., first year worldwide sales of the item) from a large number of queries that define candidate predictors. To make a prediction for a new item, the data needed to generate such predictors has to be collected (e.g., selling the new item in Peoria, IL for a week and collecting the sales data). A useful predictor is one that has high prediction accuracy and a low data-collection cost. We call such a cost-effective predictor a {\em bellwether}.\par This article introduces bellwether analysis, which integrates database query processing and predictive modeling into a single framework, and provides scalable algorithms for large datasets that cannot fit in main memory. 
Through a series of extensive experiments, we show that bellwethers do exist in real-world databases, and that our computation techniques achieve good efficiency on large datasets.", acknowledgement = ack-nhfb, articleno = "5", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J1054", keywords = "bellwether; Cost-effective prediction; data cube; OLAP queries; predictive models; scalable algorithms", } @Article{Liu:2009:ISI, author = "Huan Liu and John Salerno and Michael Young and Rakesh Agrawal and Philip S. Yu", title = "Introduction to special issue on social computing, behavioral modeling, and prediction", journal = j-TKDD, volume = "3", number = "2", pages = "6:1--6:??", month = apr, year = "2009", CODEN = "????", DOI = "http://doi.acm.org/10.1145/1514888.1514889", ISSN = "1556-4681 (print), 1556-472X (electronic)", bibdate = "Fri Apr 24 18:00:12 MDT 2009", bibsource = "http://www.acm.org/pubs/contents/journals/tkdd/; http://www.math.utah.edu/pub/tex/bib/tkdd.bib", acknowledgement = ack-nhfb, articleno = "6", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J1054", } @Article{Mehler:2009:ENC, author = "Andrew Mehler and Steven Skiena", title = "Expanding network communities from representative examples", journal = j-TKDD, volume = "3", number = "2", pages = "7:1--7:??", month = apr, year = "2009", CODEN = "????", DOI = "http://doi.acm.org/10.1145/1514888.1514890", ISSN = "1556-4681 (print), 1556-472X (electronic)", bibdate = "Fri Apr 24 18:00:12 MDT 2009", bibsource = "http://www.acm.org/pubs/contents/journals/tkdd/; http://www.math.utah.edu/pub/tex/bib/tkdd.bib", abstract = "We present an approach to leverage a small subset of a coherent community within a social network into a much larger, more representative sample. Our problem becomes identifying a small conductance subgraph containing many (but not necessarily all) members of the given seed set. 
Starting with an initial seed set representing a sample of a community, we seek to discover as much of the full community as possible.\par We present a general method for network community expansion, demonstrating that our methods work well in expanding communities in real world networks starting from small given seed groups (20 to 400 members). Our approach is marked by incremental expansion from the seeds with retrospective analysis to determine the ultimate boundaries of our community. We demonstrate how to increase the robustness of the general approach through bootstrapping multiple random partitions of the input set into seed and evaluation groups.\par We go beyond statistical comparisons against gold standards to careful subjective evaluations of our expanded communities. This process explains the causes of most disagreement between our expanded communities and our gold-standards --- arguing that our expansion methods provide more reliable communities than can be extracted from reference sources/gazetteers such as Wikipedia.", acknowledgement = ack-nhfb, articleno = "7", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J1054", keywords = "artificial intelligence; community discovery; Discrete mathematics; graph theory; news analysis; social networks", } @Article{Lin:2009:ACT, author = "Yu-Ru Lin and Yun Chi and Shenghuo Zhu and Hari Sundaram and Belle L. Tseng", title = "Analyzing communities and their evolutions in dynamic social networks", journal = j-TKDD, volume = "3", number = "2", pages = "8:1--8:??", month = apr, year = "2009", CODEN = "????", DOI = "http://doi.acm.org/10.1145/1514888.1514891", ISSN = "1556-4681 (print), 1556-472X (electronic)", bibdate = "Fri Apr 24 18:00:12 MDT 2009", bibsource = "http://www.acm.org/pubs/contents/journals/tkdd/; http://www.math.utah.edu/pub/tex/bib/tkdd.bib", abstract = "We discover communities from social network data and analyze the community evolution. 
These communities are inherent characteristics of human interaction in online social networks, as well as paper citation networks. Also, communities may evolve over time, due to changes to individuals' roles and social status in the network as well as changes to individuals' research interests. We present an innovative algorithm that deviates from the traditional two-step approach to analyze community evolutions. In the traditional approach, communities are first detected for each time slice, and then compared to determine correspondences. We argue that this approach is inappropriate in applications with noisy data. In this paper, we propose {\em FacetNet\/} for analyzing communities and their evolutions through a robust {\em unified\/} process. This novel framework will discover communities and capture their evolution with temporal smoothness given by historic community structures. Our approach relies on formulating the problem in terms of maximum a posteriori (MAP) estimation, where the community structure is estimated both by the observed networked data and by the prior distribution given by historic community structures. Then we develop an iterative algorithm, with proven low time complexity, which is guaranteed to converge to an optimal solution. 
We perform extensive experimental studies, on both synthetic datasets and real datasets, to demonstrate that our method discovers meaningful communities and provides additional insights not directly obtainable from traditional methods.", acknowledgement = ack-nhfb, articleno = "8", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J1054", keywords = "Community; community net; evolution; evolution net; nonnegative matrix factorization; soft membership", } @Article{Kimura:2009:BLM, author = "Masahiro Kimura and Kazumi Saito and Hiroshi Motoda", title = "Blocking links to minimize contamination spread in a social network", journal = j-TKDD, volume = "3", number = "2", pages = "9:1--9:??", month = apr, year = "2009", CODEN = "????", DOI = "http://doi.acm.org/10.1145/1514888.1514892", ISSN = "1556-4681 (print), 1556-472X (electronic)", bibdate = "Fri Apr 24 18:00:12 MDT 2009", bibsource = "http://www.acm.org/pubs/contents/journals/tkdd/; http://www.math.utah.edu/pub/tex/bib/tkdd.bib", abstract = "We address the problem of minimizing the propagation of undesirable things, such as computer viruses or malicious rumors, by blocking a limited number of links in a network, which is converse to the influence maximization problem in which the most influential nodes for information diffusion is searched in a social network. This minimization problem is more fundamental than the problem of preventing the spread of contamination by removing nodes in a network. We introduce two definitions for the contamination degree of a network, accordingly define two contamination minimization problems, and propose methods for efficiently finding good approximate solutions to these problems on the basis of a naturally greedy strategy. Using large social networks, we experimentally demonstrate that the proposed methods outperform conventional link-removal methods. 
We also show that unlike the case of blocking a limited number of nodes, the strategy of removing nodes with high out-degrees is not necessarily effective for these problems.", acknowledgement = ack-nhfb, articleno = "9", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J1054", keywords = "Contamination diffusion; link analysis; social networks", } @Article{Agichtein:2009:MIS, author = "Eugene Agichtein and Yandong Liu and Jiang Bian", title = "Modeling information-seeker satisfaction in community question answering", journal = j-TKDD, volume = "3", number = "2", pages = "10:1--10:??", month = apr, year = "2009", CODEN = "????", DOI = "http://doi.acm.org/10.1145/1514888.1514893", ISSN = "1556-4681 (print), 1556-472X (electronic)", bibdate = "Fri Apr 24 18:00:12 MDT 2009", bibsource = "http://www.acm.org/pubs/contents/journals/tkdd/; http://www.math.utah.edu/pub/tex/bib/tkdd.bib", abstract = "Question Answering Communities such as Naver, Baidu Knows, and Yahoo! Answers have emerged as popular, and often effective, means of information seeking on the web. By posting questions for other participants to answer, information seekers can obtain specific answers to their questions. Users of CQA portals have already contributed millions of questions, and received hundreds of millions of answers from other participants. However, CQA is not always effective: in some cases, a user may obtain a perfect answer within minutes, and in others it may require hours --- and sometimes days --- until a satisfactory answer is contributed. We investigate the problem of predicting information seeker satisfaction in collaborative question answering communities, where we attempt to predict whether a question author will be satisfied with the answers submitted by the community participants. We present a general prediction model, and develop a variety of content, structure, and community-focused features for this task. 
Our experimental results, obtained from a large-scale evaluation over thousands of real questions and user ratings, demonstrate the feasibility of modeling and predicting asker satisfaction. We complement our results with a thorough investigation of the interactions and information seeking patterns in question answering communities that correlate with information seeker satisfaction. We also explore {\em personalized\/} models of asker satisfaction, and show that when sufficient interaction history exists, personalization can significantly improve prediction accuracy over a ``one-size-fits-all'' model. Our models and predictions could be useful for a variety of applications, such as user intent inference, answer ranking, interface design, and query suggestion and routing.", acknowledgement = ack-nhfb, articleno = "10", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J1054", keywords = "Community question answering; information seeker satisfaction", } @Article{Torvik:2009:AND, author = "Vetle I. Torvik and Neil R. Smalheiser", title = "Author name disambiguation in {MEDLINE}", journal = j-TKDD, volume = "3", number = "3", pages = "11:1--11:??", month = jul, year = "2009", CODEN = "????", DOI = "http://doi.acm.org/10.1145/1552303.1552304", ISSN = "1556-4681 (print), 1556-472X (electronic)", bibdate = "Tue Mar 16 18:36:58 MDT 2010", bibsource = "http://www.acm.org/pubs/contents/journals/tkdd/; http://www.math.utah.edu/pub/tex/bib/tkdd.bib", abstract = "{\em Background\/}: We recently described ``Author-ity,'' a model for estimating the probability that two articles in MEDLINE, sharing the same author name, were written by the same individual. Features include shared title words, journal name, coauthors, medical subject headings, language, affiliations, and author name features (middle initial, suffix, and prevalence in MEDLINE). 
Here we test the hypothesis that the Author-ity model will suffice to disambiguate author names for the vast majority of articles in MEDLINE. {\em Methods\/}: Enhancements include: (a) incorporating first names and their variants, email addresses, and correlations between specific last names and affiliation words; (b) new methods of generating large unbiased training sets; (c) new methods for estimating the prior probability; (d) a weighted least squares algorithm for correcting transitivity violations; and (e) a maximum likelihood based agglomerative algorithm for computing clusters of articles that represent inferred author-individuals. {\em Results\/}: Pairwise comparisons were computed for all author names on all 15.3 million articles in MEDLINE (2006 baseline), that share last name and first initial, to create Author-ity 2006, a database that has each name on each article assigned to one of 6.7 million inferred author-individual clusters. Recall is estimated at $\approx 98.8\%$. Lumping (putting two different individuals into the same cluster) affects $\approx 0.5\%$ of clusters, whereas splitting (assigning articles written by the same individual to $> 1$ cluster) affects $\approx 2\%$ of articles. {\em Impact\/}: The Author-ity model can be applied generally to other bibliographic databases. Author name disambiguation allows information retrieval and data integration to become {\em person-centered}, not just {\em document-centered}, setting the stage for new data mining and social network tools that will facilitate the analysis of scholarly publishing and collaboration behavior. 
{\em Availability\/}: The Author-ity 2006 database is available for nonprofit academic research, and can be freely queried via http://arrowsmith.psych.uic.edu.", acknowledgement = ack-nhfb, articleno = "11", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J1054", keywords = "bibliographic databases; Name disambiguation", } @Article{Tu:2009:SDC, author = "Li Tu and Yixin Chen", title = "Stream data clustering based on grid density and attraction", journal = j-TKDD, volume = "3", number = "3", pages = "12:1--12:??", month = jul, year = "2009", CODEN = "????", DOI = "http://doi.acm.org/10.1145/1552303.1552305", ISSN = "1556-4681 (print), 1556-472X (electronic)", bibdate = "Tue Mar 16 18:36:58 MDT 2010", bibsource = "http://www.acm.org/pubs/contents/journals/tkdd/; http://www.math.utah.edu/pub/tex/bib/tkdd.bib", abstract = "Clustering real-time stream data is an important and challenging problem. Existing algorithms such as CluStream are based on the {\em k\/} -means algorithm. These clustering algorithms have difficulties finding clusters of arbitrary shapes and handling outliers. Further, they require the knowledge of {\em k\/} and user-specified time window. To address these issues, this article proposes {\em D-Stream}, a framework for clustering stream data using a density-based approach.\par Our algorithm uses an online component that maps each input data record into a grid and an offline component that computes the grid density and clusters the grids based on the density. The algorithm adopts a density decaying technique to capture the dynamic changes of a data stream and an attraction-based mechanism to accurately generate cluster boundaries.\par Exploiting the intricate relationships among the decay factor, attraction, data density, and cluster structure, our algorithm can efficiently and effectively generate and adjust the clusters in real time. 
Further, a theoretically sound technique is developed to detect and remove sporadic grids mapped by outliers in order to dramatically improve the space and time efficiency of the system. The technique makes high-speed data stream clustering feasible without degrading the clustering quality. The experimental results show that our algorithm has superior quality and efficiency, can find clusters of arbitrary shapes, and can accurately recognize the evolving behaviors of real-time data streams.", acknowledgement = ack-nhfb, articleno = "12", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J1054", keywords = "clustering; data mining; density-based algorithms; Stream data", } @Article{Zhou:2009:LST, author = "Bin Zhou and Jian Pei", title = "Link spam target detection using page farms", journal = j-TKDD, volume = "3", number = "3", pages = "13:1--13:??", month = jul, year = "2009", CODEN = "????", DOI = "http://doi.acm.org/10.1145/1552303.1552306", ISSN = "1556-4681 (print), 1556-472X (electronic)", bibdate = "Tue Mar 16 18:36:58 MDT 2010", bibsource = "http://www.acm.org/pubs/contents/journals/tkdd/; http://www.math.utah.edu/pub/tex/bib/tkdd.bib", abstract = "Currently, most popular Web search engines adopt some link-based ranking methods such as PageRank. Driven by the huge potential benefit of improving rankings of Web pages, many tricks have been attempted to boost page rankings. The most common way, which is known as link spam, is to make up some artificially designed link structures. Detecting link spam effectively is a big challenge. In this article, we develop novel and effective detection methods for link spam target pages using page farms. The essential idea is intuitive: whether a page is the beneficiary of link spam is reflected by how it collects its PageRank score. Technically, how a target page collects its PageRank score is modeled by a page farm, which consists of pages contributing a major portion of the PageRank score of the target page. 
We propose two spamicity measures based on page farms. They can be used as an effective measure to check whether the pages are link spam target pages. An empirical study using a newly available real dataset strongly suggests that our method is effective. It outperforms the state-of-the-art methods like SpamRank and SpamMass in both precision and recall.", acknowledgement = ack-nhfb, articleno = "13", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J1054", keywords = "Link Spam; Page Farm; PageRank", } @Article{Wan:2009:DBC, author = "Li Wan and Wee Keong Ng and Xuan Hong Dang and Philip S. Yu and Kuan Zhang", title = "Density-based clustering of data streams at multiple resolutions", journal = j-TKDD, volume = "3", number = "3", pages = "14:1--14:??", month = jul, year = "2009", CODEN = "????", DOI = "http://doi.acm.org/10.1145/1552303.1552307", ISSN = "1556-4681 (print), 1556-472X (electronic)", bibdate = "Tue Mar 16 18:36:58 MDT 2010", bibsource = "http://www.acm.org/pubs/contents/journals/tkdd/; http://www.math.utah.edu/pub/tex/bib/tkdd.bib", abstract = "In data stream clustering, it is desirable to have algorithms that are able to detect clusters of arbitrary shape, clusters that evolve over time, and clusters with noise. Existing stream data clustering algorithms are generally based on an online-offline approach: The online component captures synopsis information from the data stream (thus, overcoming real-time and memory constraints) and the offline component generates clusters using the stored synopsis. The online-offline approach affects the overall performance of stream data clustering in various ways: the ease of deriving synopsis from streaming data; the complexity of data structure for storing and managing synopsis; and the frequency at which the offline component is used to generate clusters. 
In this article, we propose an algorithm that (1) computes and updates synopsis information in constant time; (2) allows users to discover clusters at multiple resolutions; (3) determines the right time for users to generate clusters from the synopsis information; (4) generates clusters of higher purity than existing algorithms; and (5) determines the right threshold function for density-based clustering based on the fading model of stream data. To the best of our knowledge, no existing data stream algorithm has all of these features. Experimental results show that our algorithm is able to detect arbitrarily shaped, evolving clusters with high quality.", acknowledgement = ack-nhfb, articleno = "14", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J1054", keywords = "Data mining algorithms; density based clustering; evolving data streams", } @Article{Mannila:2009:ATS, author = "Heikki Mannila and Dimitrios Gunopulos", title = "{ACM TKDD} special issue {ACM SIGKDD 2007} and {ACM SIGKDD 2008}", journal = j-TKDD, volume = "3", number = "4", pages = "15:1--15:??", month = nov, year = "2009", CODEN = "????", DOI = "http://doi.acm.org/10.1145/1631162.1631163", ISSN = "1556-4681 (print), 1556-472X (electronic)", bibdate = "Tue Mar 16 18:37:13 MDT 2010", bibsource = "http://www.acm.org/pubs/contents/journals/tkdd/; http://www.math.utah.edu/pub/tex/bib/tkdd.bib", acknowledgement = ack-nhfb, articleno = "15", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J1054", } @Article{Asur:2009:EBF, author = "Sitaram Asur and Srinivasan Parthasarathy and Duygu Ucar", title = "An event-based framework for characterizing the evolutionary behavior of interaction graphs", journal = j-TKDD, volume = "3", number = "4", pages = "16:1--16:??", month = nov, year = "2009", CODEN = "????", DOI = "http://doi.acm.org/10.1145/1631162.1631164", ISSN = "1556-4681 (print), 1556-472X (electronic)", bibdate = "Tue Mar 16 18:37:13 MDT 2010", bibsource = 
"http://www.acm.org/pubs/contents/journals/tkdd/; http://www.math.utah.edu/pub/tex/bib/tkdd.bib", abstract = "Interaction graphs are ubiquitous in many fields such as bioinformatics, sociology and physical sciences. There have been many studies in the literature targeted at studying and mining these graphs. However, almost all of them have studied these graphs from a static point of view. The study of the evolution of these graphs over time can provide tremendous insight on the behavior of entities, communities and the flow of information among them. In this work, we present an event-based characterization of critical behavioral patterns for temporally varying interaction graphs. We use nonoverlapping snapshots of interaction graphs and develop a framework for capturing and identifying interesting events from them. We use these events to characterize complex behavioral patterns of individuals and communities over time. We show how semantic information can be incorporated to reason about community-behavior events. We also demonstrate the application of behavioral patterns for the purposes of modeling evolution, link prediction and influence maximization. Finally, we present a diffusion model for evolving networks, based on our framework.", acknowledgement = ack-nhfb, articleno = "16", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J1054", keywords = "diffusion of innovations; Dynamic interaction networks; evolutionary analysis", } @Article{Chi:2009:ESC, author = "Yun Chi and Xiaodan Song and Dengyong Zhou and Koji Hino and Belle L. 
Tseng", title = "On evolutionary spectral clustering", journal = j-TKDD, volume = "3", number = "4", pages = "17:1--17:??", month = nov, year = "2009", CODEN = "????", DOI = "http://doi.acm.org/10.1145/1631162.1631165", ISSN = "1556-4681 (print), 1556-472X (electronic)", bibdate = "Tue Mar 16 18:37:13 MDT 2010", bibsource = "http://www.acm.org/pubs/contents/journals/tkdd/; http://www.math.utah.edu/pub/tex/bib/tkdd.bib", abstract = "Evolutionary clustering is an emerging research area essential to important applications such as clustering dynamic Web and blog contents and clustering data streams. In evolutionary clustering, a good clustering result should fit the current data well, while simultaneously not deviate too dramatically from the recent history. To fulfill this dual purpose, a measure of {\em temporal smoothness\/} is integrated in the overall measure of clustering quality. In this article, we propose two frameworks that incorporate temporal smoothness in evolutionary spectral clustering. For both frameworks, we start with intuitions gained from the well-known {\em k\/} -means clustering problem, and then propose and solve corresponding cost functions for the evolutionary spectral clustering problems. Our solutions to the evolutionary spectral clustering problems provide more stable and consistent clustering results that are less sensitive to short-term noises while at the same time are adaptive to long-term cluster drifts. Furthermore, we demonstrate that our methods provide the optimal solutions to the relaxed versions of the corresponding evolutionary {\em k\/} -means clustering problems. 
Performance experiments over a number of real and synthetic data sets illustrate our evolutionary spectral clustering methods provide more robust clustering results that are not sensitive to noise and can adapt to data drifts.", acknowledgement = ack-nhfb, articleno = "17", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J1054", keywords = "Evolutionary spectral clustering; preserving cluster membership; preserving cluster quality; temporal smoothness", } @Article{Fujiwara:2009:FLS, author = "Yasuhiro Fujiwara and Yasushi Sakurai and Masaru Kitsuregawa", title = "Fast likelihood search for hidden {Markov} models", journal = j-TKDD, volume = "3", number = "4", pages = "18:1--18:??", month = nov, year = "2009", CODEN = "????", DOI = "http://doi.acm.org/10.1145/1631162.1631166", ISSN = "1556-4681 (print), 1556-472X (electronic)", bibdate = "Tue Mar 16 18:37:13 MDT 2010", bibsource = "http://www.acm.org/pubs/contents/journals/tkdd/; http://www.math.utah.edu/pub/tex/bib/tkdd.bib", abstract = "Hidden Markov models (HMMs) are receiving considerable attention in various communities and many applications that use HMMs have emerged such as mental task classification, biological analysis, traffic monitoring, and anomaly detection. This article has two goals; The first goal is exact and efficient identification of the model whose state sequence has the highest likelihood for the given query sequence (more precisely, no HMM that actually has a high-probability path for the given sequence is missed by the algorithm), and the second goal is exact and efficient monitoring of streaming data sequences to find the best model. We propose SPIRAL, a fast search method for HMM datasets. 
SPIRAL is based on three ideas; (1) it clusters states of models to compute approximate likelihood, (2) it uses several granularities and approximates likelihood values in search processing, and (3) it focuses on just the promising likelihood computations by pruning out low-likelihood state sequences. Experiments verify the effectiveness of SPIRAL and show that it is more than 490 times faster than the naive method.", acknowledgement = ack-nhfb, articleno = "18", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J1054", keywords = "Hidden Markov model; likelihood; upper bound", } @Article{Zhang:2009:EAG, author = "Xiang Zhang and Fei Zou and Wei Wang", title = "Efficient algorithms for genome-wide association study", journal = j-TKDD, volume = "3", number = "4", pages = "19:1--19:??", month = nov, year = "2009", CODEN = "????", DOI = "http://doi.acm.org/10.1145/1631162.1631167", ISSN = "1556-4681 (print), 1556-472X (electronic)", bibdate = "Tue Mar 16 18:37:13 MDT 2010", bibsource = "http://www.acm.org/pubs/contents/journals/tkdd/; http://www.math.utah.edu/pub/tex/bib/tkdd.bib", abstract = "Studying the association between quantitative phenotype (such as height or weight) and single nucleotide polymorphisms (SNPs) is an important problem in biology. To understand underlying mechanisms of complex phenotypes, it is often necessary to consider joint genetic effects across multiple SNPs. ANOVA (analysis of variance) test is routinely used in association study. Important findings from studying gene-gene (SNP-pair) interactions are appearing in the literature. However, the number of SNPs can be up to millions. Evaluating joint effects of SNPs is a challenging task even for SNP-pairs. 
Moreover, with large number of SNPs correlated, permutation procedure is preferred over simple Bonferroni correction for properly controlling family-wise error rate and retaining mapping power, which dramatically increases the computational cost of association study.\par In this article, we study the problem of finding SNP-pairs that have significant associations with a given quantitative phenotype. We propose an efficient algorithm, FastANOVA, for performing ANOVA tests on SNP-pairs in a batch mode, which also supports large permutation test. We derive an upper bound of SNP-pair ANOVA test, which can be expressed as the sum of two terms. The first term is based on single-SNP ANOVA test. The second term is based on the SNPs and independent of any phenotype permutation. Furthermore, SNP-pairs can be organized into groups, each of which shares a common upper bound. This allows for maximum reuse of intermediate computation, efficient upper bound estimation, and effective SNP-pair pruning. Consequently, FastANOVA only needs to perform the ANOVA test on a small number of candidate SNP-pairs without the risk of missing any significant ones. Extensive experiments demonstrate that FastANOVA is orders of magnitude faster than the brute-force implementation of ANOVA tests on all SNP pairs. 
The principles used in FastANOVA can be applied to categorical phenotypes and other statistics such as Chi-square test.", acknowledgement = ack-nhfb, articleno = "19", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J1054", keywords = "ANOVA test; Association study; permutation test", } @Article{Bilgic:2009:RCM, author = "Mustafa Bilgic and Lise Getoor", title = "Reflect and correct: {A} misclassification prediction approach to active inference", journal = j-TKDD, volume = "3", number = "4", pages = "20:1--20:??", month = nov, year = "2009", CODEN = "????", DOI = "http://doi.acm.org/10.1145/1631162.1631168", ISSN = "1556-4681 (print), 1556-472X (electronic)", bibdate = "Tue Mar 16 18:37:13 MDT 2010", bibsource = "http://www.acm.org/pubs/contents/journals/tkdd/; http://www.math.utah.edu/pub/tex/bib/tkdd.bib", abstract = "Information diffusion, viral marketing, graph-based semi-supervised learning, and collective classification all attempt to model and exploit the relationships among nodes in a network to improve the performance of node labeling algorithms. However, sometimes the advantage of exploiting the relationships can become a disadvantage. Simple models like label propagation and iterative classification can aggravate a misclassification by propagating mistakes in the network, while more complex models that define and optimize a global objective function, such as Markov random fields and graph mincuts, can misclassify a set of nodes jointly. This problem can be mitigated if the classification system is allowed to ask for the correct labels for a few of the nodes during inference. However, determining the optimal set of labels to acquire is intractable under relatively general assumptions, which forces us to resort to approximate and heuristic techniques. We describe three such techniques in this article. 
The first one is based on directly approximating the value of the objective function of label acquisition and greedily acquiring the label that provides the most improvement. The second technique is a simple technique based on the analogy we draw between viral marketing and label acquisition. Finally, we propose a method, which we refer to as {\em reflect and correct}, that can learn and predict when the classification system is likely to make mistakes and suggests acquisitions to correct those mistakes. We empirically show on a variety of synthetic and real-world datasets that the reflect and correct method significantly outperforms the other two techniques, as well as other approaches based on network structural measures such as node degree and network clustering.", acknowledgement = ack-nhfb, articleno = "20", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J1054", keywords = "Active inference; collective classification; information diffusion; label acquisition; viral marketing", } @Article{Kiernan:2009:CCS, author = "Jerry Kiernan and Evimaria Terzi", title = "Constructing comprehensive summaries of large event sequences", journal = j-TKDD, volume = "3", number = "4", pages = "21:1--21:??", month = nov, year = "2009", CODEN = "????", DOI = "http://doi.acm.org/10.1145/1631162.1631169", ISSN = "1556-4681 (print), 1556-472X (electronic)", bibdate = "Tue Mar 16 18:37:13 MDT 2010", bibsource = "http://www.acm.org/pubs/contents/journals/tkdd/; http://www.math.utah.edu/pub/tex/bib/tkdd.bib", abstract = "Event sequences capture system and user activity over time. Prior research on sequence mining has mostly focused on discovering local patterns appearing in a sequence. While interesting, these patterns do not give a comprehensive summary of the entire event sequence. Moreover, the number of patterns discovered can be large. 
In this article, we take an alternative approach and build {\em short\/} summaries that describe an entire sequence, and discover local dependencies between event types.\par We formally define the summarization problem as an optimization problem that balances shortness of the summary with accuracy of the data description. We show that this problem can be solved optimally in polynomial time by using a combination of two dynamic-programming algorithms. We also explore more efficient greedy alternatives and demonstrate that they work well on large datasets. Experiments on both synthetic and real datasets illustrate that our algorithms are efficient and produce high-quality results, and reveal interesting local structures in the data.", acknowledgement = ack-nhfb, articleno = "21", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J1054", keywords = "Event sequences; log mining; summarization", } @Article{Koren:2010:FNS, author = "Yehuda Koren", title = "Factor in the neighbors: {Scalable} and accurate collaborative filtering", journal = j-TKDD, volume = "4", number = "1", pages = "1:1--1:??", month = jan, year = "2010", CODEN = "????", DOI = "http://doi.acm.org/10.1145/1644873.1644874", ISSN = "1556-4681 (print), 1556-472X (electronic)", bibdate = "Tue Mar 16 18:37:37 MDT 2010", bibsource = "http://www.acm.org/pubs/contents/journals/tkdd/; http://www.math.utah.edu/pub/tex/bib/tkdd.bib", abstract = "Recommender systems provide users with personalized suggestions for products or services. These systems often rely on collaborative filtering (CF), where past transactions are analyzed in order to establish connections between users and products. The most common approach to CF is based on neighborhood models, which originate from similarities between products or users. In this work we introduce a new neighborhood model with an improved prediction accuracy. 
Unlike previous approaches that are based on heuristic similarities, we model neighborhood relations by minimizing a global cost function. Further accuracy improvements are achieved by extending the model to exploit both explicit and implicit feedback by the users. Past models were limited by the need to compute all pairwise similarities between items or users, which grow quadratically with input size. In particular, this limitation vastly complicates adopting user similarity models, due to the typical large number of users. Our new model solves these limitations by factoring the neighborhood model, thus making both item-item and user-user implementations scale linearly with the size of the data. The methods are tested on the Netflix data, with encouraging results.", acknowledgement = ack-nhfb, articleno = "1", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J1054", keywords = "collaborative filtering; Netflix Prize; Recommender systems", } @Article{Syed:2010:MDP, author = "Zeeshan Syed and Collin Stultz and Manolis Kellis and Piotr Indyk and John Guttag", title = "Motif discovery in physiological datasets: {A} methodology for inferring predictive elements", journal = j-TKDD, volume = "4", number = "1", pages = "2:1--2:??", month = jan, year = "2010", CODEN = "????", DOI = "http://doi.acm.org/10.1145/1644873.1644875", ISSN = "1556-4681 (print), 1556-472X (electronic)", bibdate = "Tue Mar 16 18:37:37 MDT 2010", bibsource = "http://www.acm.org/pubs/contents/journals/tkdd/; http://www.math.utah.edu/pub/tex/bib/tkdd.bib", abstract = "In this article, we propose a methodology for identifying predictive physiological patterns in the absence of prior knowledge. We use the principle of conservation to identify activity that consistently precedes an outcome in patients, and describe a two-stage process that allows us to efficiently search for such patterns in large datasets. 
This involves first transforming continuous physiological signals from patients into symbolic sequences, and then searching for patterns in these reduced representations that are strongly associated with an outcome.\par Our strategy of identifying conserved activity that is unlikely to have occurred purely by chance in symbolic data is analogous to the discovery of regulatory motifs in genomic datasets. We build upon existing work in this area, generalizing the notion of a regulatory motif and enhancing current techniques to operate robustly on non-genomic data. We also address two significant considerations associated with motif discovery in general: computational efficiency and robustness in the presence of degeneracy and noise. To deal with these issues, we introduce the concept of active regions and new subset-based techniques such as a two-layer Gibbs sampling algorithm. These extensions allow for a framework for information inference, where precursors are identified as approximately conserved activity of arbitrary complexity preceding multiple occurrences of an event.\par We evaluated our solution on a population of patients who experienced sudden cardiac death and attempted to discover electrocardiographic activity that may be associated with the endpoint of death. To assess the predictive patterns discovered, we compared likelihood scores for motifs in the sudden death population against control populations of normal individuals and those with non-fatal supraventricular arrhythmias. Our results suggest that predictive motif discovery may be able to identify clinically relevant information even in the absence of significant prior knowledge.", acknowledgement = ack-nhfb, articleno = "2", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J1054", keywords = "data mining; Gibbs sampling; inference; knowledge discovery; motifs; physiological signals", } @Article{Webb:2010:SSI, author = "Geoffrey I. 
Webb", title = "Self-sufficient itemsets: {An} approach to screening potentially interesting associations between items", journal = j-TKDD, volume = "4", number = "1", pages = "3:1--3:??", month = jan, year = "2010", CODEN = "????", DOI = "http://doi.acm.org/10.1145/1644873.1644876", ISSN = "1556-4681 (print), 1556-472X (electronic)", bibdate = "Tue Mar 16 18:37:37 MDT 2010", bibsource = "http://www.acm.org/pubs/contents/journals/tkdd/; http://www.math.utah.edu/pub/tex/bib/tkdd.bib", abstract = "Self-sufficient itemsets are those whose frequency cannot be explained solely by the frequency of either their subsets or of their supersets. We argue that itemsets that are not self-sufficient will often be of little interest to the data analyst, as their frequency should be expected once that of the itemsets on which their frequency depends is known. We present tests for statistically sound discovery of self-sufficient itemsets, and computational techniques that allow those tests to be applied as a post-processing step for any itemset discovery algorithm. 
We also present a measure for assessing the degree of potential interest in an itemset that complements these statistical measures.", acknowledgement = ack-nhfb, articleno = "3", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J1054", keywords = "Association discovery; association rules; itemset discovery; itemset screening; statistical evaluation", } @Article{Plantevit:2010:MMM, author = "Marc Plantevit and Anne Laurent and Dominique Laurent and Maguelonne Teisseire and Yeow Wei Choong", title = "Mining multidimensional and multilevel sequential patterns", journal = j-TKDD, volume = "4", number = "1", pages = "4:1--4:??", month = jan, year = "2010", CODEN = "????", DOI = "http://doi.acm.org/10.1145/1644873.1644877", ISSN = "1556-4681 (print), 1556-472X (electronic)", bibdate = "Tue Mar 16 18:37:37 MDT 2010", bibsource = "http://www.acm.org/pubs/contents/journals/tkdd/; http://www.math.utah.edu/pub/tex/bib/tkdd.bib", abstract = "Multidimensional databases have been designed to provide decision makers with the necessary tools to help them understand their data. This framework is different from transactional data as the datasets contain huge volumes of historicized and aggregated data defined over a set of dimensions that can be arranged through multiple levels of granularities. Many tools have been proposed to query the data and navigate through the levels of granularity. However, automatic tools are still missing to mine this type of data in order to discover regular specific patterns. In this article, we present a method for mining sequential patterns from multidimensional databases, at the same time taking advantage of the different dimensions and levels of granularity, which is original compared to existing work. The necessary definitions and algorithms are extended from regular sequential patterns to this particular case. 
Experiments are reported, showing the significance of this approach.", acknowledgement = ack-nhfb, articleno = "4", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J1054", keywords = "frequent patterns; hierarchy; multidimensional databases; multilevel patterns; Sequential patterns", } @Article{Zaki:2010:VVO, author = "Mohammed J. Zaki and Christopher D. Carothers and Boleslaw K. Szymanski", title = "{VOGUE}: {A} variable order hidden {Markov} model with duration based on frequent sequence mining", journal = j-TKDD, volume = "4", number = "1", pages = "5:1--5:??", month = jan, year = "2010", CODEN = "????", DOI = "http://doi.acm.org/10.1145/1644873.1644878", ISSN = "1556-4681 (print), 1556-472X (electronic)", bibdate = "Tue Mar 16 18:37:37 MDT 2010", bibsource = "http://www.acm.org/pubs/contents/journals/tkdd/; http://www.math.utah.edu/pub/tex/bib/tkdd.bib", abstract = "We present VOGUE, a novel, variable order hidden Markov model with state durations, that combines two separate techniques for modeling complex patterns in sequential data: pattern mining and data modeling. VOGUE relies on a variable gap sequence mining method to extract frequent patterns with different lengths and gaps between elements. It then uses these mined sequences to build a variable order hidden Markov model (HMM), that explicitly models the gaps. The gaps implicitly model the order of the HMM, and they explicitly model the duration of each state. We apply VOGUE to a variety of real sequence data taken from domains such as protein sequence classification, Web usage logs, intrusion detection, and spelling correction. We show that VOGUE has superior classification accuracy compared to regular HMMs, higher-order HMMs, and even special purpose HMMs like HMMER, which is a state-of-the-art method for protein classification. 
The VOGUE implementation and the datasets used in this article are available as open-source.$^1$", acknowledgement = ack-nhfb, articleno = "5", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J1054", keywords = "Hidden Markov models; higher-order HMM; HMM with duration; sequence mining and modeling; variable-order HMM", } @Article{Vadera:2010:CCS, author = "Sunil Vadera", title = "{CSNL}: {A} cost-sensitive non-linear decision tree algorithm", journal = j-TKDD, volume = "4", number = "2", pages = "6:1--6:??", month = may, year = "2010", CODEN = "????", DOI = "http://doi.acm.org/10.1145/1754428.1754429", ISSN = "1556-4681 (print), 1556-472X (electronic)", bibdate = "Sat Aug 14 17:12:30 MDT 2010", bibsource = "http://www.acm.org/pubs/contents/journals/tkdd/; http://www.math.utah.edu/pub/tex/bib/tkdd.bib", abstract = "This article presents a new decision tree learning algorithm called CSNL that induces Cost-Sensitive Non-Linear decision trees. The algorithm is based on the hypothesis that nonlinear decision nodes provide a better basis than axis-parallel decision nodes and utilizes discriminant analysis to construct nonlinear decision trees that take account of costs of misclassification.\par The performance of the algorithm is evaluated by applying it to seventeen datasets and the results are compared with those obtained by two well known cost-sensitive algorithms, ICET and MetaCost, which generate multiple trees to obtain some of the best results to date. The results show that CSNL performs at least as well, if not better than these algorithms, in more than twelve of the datasets and is considerably faster. 
The use of bagging with CSNL further enhances its performance showing the significant benefits of using nonlinear decision nodes.", acknowledgement = ack-nhfb, articleno = "6", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J1054", keywords = "cost-sensitive learning; Decision tree learning", } @Article{Kandylas:2010:AKC, author = "Vasileios Kandylas and S. Phineas Upham and Lyle H. Ungar", title = "Analyzing knowledge communities using foreground and background clusters", journal = j-TKDD, volume = "4", number = "2", pages = "7:1--7:??", month = may, year = "2010", CODEN = "????", DOI = "http://doi.acm.org/10.1145/1754428.1754430", ISSN = "1556-4681 (print), 1556-472X (electronic)", bibdate = "Sat Aug 14 17:12:30 MDT 2010", bibsource = "http://www.acm.org/pubs/contents/journals/tkdd/; http://www.math.utah.edu/pub/tex/bib/tkdd.bib", abstract = "Insight into the growth (or shrinkage) of ``knowledge communities'' of authors that build on each other's work can be gained by studying the evolution over time of clusters of documents. We cluster documents based on the documents they cite in common using the Streemer clustering method, which finds cohesive foreground clusters (the knowledge communities) embedded in a diffuse background. We build predictive models with features based on the citation structure, the vocabulary of the papers, and the affiliations and prestige of the authors and use these models to study the drivers of community growth and the predictors of how widely a paper will be cited. 
We find that scientific knowledge communities tend to grow more rapidly if their publications build on diverse information and use narrow vocabulary and that papers that lie on the periphery of a community have the highest impact, while those not in any community have the lowest impact.", acknowledgement = ack-nhfb, articleno = "7", fjournal = "ACM Transactions on Knowledge Discovery from Data (TKDD)", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J1054", keywords = "citation analysis; clustering; community evolution; knowledge communities; Text mining", } @Article{Ji:2010:SSL, author = "Shuiwang Ji and Lei Tang and Shipeng Yu and Jieping Ye", title = "A shared-subspace learning framework for multi-label classification", journal = j-TKDD, volume = "4", number = "2", pages = "8:1--8:??", month = may, year = "2010", CODEN = "????", DOI = "http://doi.acm.org/10.1145/1754428.1754431", ISSN = "1556-4681 (print), 1556-472X (electronic)", bibdate = "Sat Aug 14 17:12:30 MDT 2010", bibsource = "http://www.acm.org/pubs/contents/journals/tkdd/; http://www.math.utah.edu/pub/tex/bib/tkdd.bib", abstract = "Multi-label problems arise in various domains such as multi-topic document categorization, protein function prediction, and automatic image annotation. One natural way to deal with such problems is to construct a binary classifier for each label, resulting in a set of independent binary classification problems. Since multiple labels share the same input space, and the semantics conveyed by different labels are usually correlated, it is essential to exploit the correlation information contained in different labels. In this paper, we consider a general framework for extracting shared structures in multi-label classification. In this framework, a common subspace is assumed to be shared among multiple labels. We show that the optimal solution to the proposed formulation can be obtained by solving a generalized eigenvalue problem, though the problem is nonconvex.
For high-dimensional problems, direct computation of the solution is expensive, and we develop an efficient algorithm for this case. One appealing feature of the proposed framework is that it includes several well-known algorithms as special cases, thus elucidating their intrinsic relationships. We further show that the proposed framework can be extended to the kernel-induced feature space. We have conducted extensive experiments on multi-topic web page categorization and automatic gene expression pattern image annotation tasks, and results demonstrate the effectiveness of the proposed formulation in comparison with several representative algorithms.", acknowledgement = ack-nhfb, articleno = "8", fjournal = "ACM Transactions on Knowledge Discovery from Data (TKDD)", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J1054", keywords = "gene expression pattern image annotation; kernel methods; least squares loss; Multi-label classification; shared subspace; singular value decomposition; web page categorization", } @Article{Ruggieri:2010:DMD, author = "Salvatore Ruggieri and Dino Pedreschi and Franco Turini", title = "Data mining for discrimination discovery", journal = j-TKDD, volume = "4", number = "2", pages = "9:1--9:??", month = may, year = "2010", CODEN = "????", DOI = "http://doi.acm.org/10.1145/1754428.1754432", ISSN = "1556-4681 (print), 1556-472X (electronic)", bibdate = "Sat Aug 14 17:12:30 MDT 2010", bibsource = "http://www.acm.org/pubs/contents/journals/tkdd/; http://www.math.utah.edu/pub/tex/bib/tkdd.bib", abstract = "In the context of civil rights law, discrimination refers to unfair or unequal treatment of people based on membership to a category or a minority, without regard to individual merit. Discrimination in credit, mortgage, insurance, labor market, and education has been investigated by researchers in economics and human sciences.
With the advent of automatic decision support systems, such as credit scoring systems, the ease of data collection opens several challenges to data analysts for the fight against discrimination. In this article, we introduce the problem of discovering discrimination through data mining in a dataset of historical decision records, taken by humans or by automatic systems. We formalize the processes of direct and indirect discrimination discovery by modelling protected-by-law groups and contexts where discrimination occurs in a classification rule based syntax. Basically, classification rules extracted from the dataset allow for unveiling contexts of unlawful discrimination, where the degree of burden over protected-by-law groups is formalized by an extension of the lift measure of a classification rule. In direct discrimination, the extracted rules can be directly mined in search of discriminatory contexts. In indirect discrimination, the mining process needs some background knowledge as a further input, for example, census data, that combined with the extracted rules might allow for unveiling contexts of discriminatory decisions. A strategy adopted for combining extracted classification rules with background knowledge is called an inference model. In this article, we propose two inference models and provide automatic procedures for their implementation. An empirical assessment of our results is provided on the German credit dataset and on the PKDD Discovery Challenge 1999 financial dataset.", acknowledgement = ack-nhfb, articleno = "9", fjournal = "ACM Transactions on Knowledge Discovery from Data (TKDD)", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J1054", keywords = "classification rules; Discrimination", } @Article{Thomas:2010:MMF, author = "Lini T. Thomas and Satyanarayana R.
Valluri and Kamalakar Karlapalem", title = "{MARGIN}: {Maximal} frequent subgraph mining", journal = j-TKDD, volume = "4", number = "3", pages = "10:1--10:??", month = oct, year = "2010", CODEN = "????", DOI = "http://dx.doi.org/10.1145/1839490.1839491", ISSN = "1556-4681 (print), 1556-472X (electronic)", bibdate = "Mon Mar 28 11:43:57 MDT 2011", bibsource = "http://www.acm.org/pubs/contents/journals/tkdd/; http://www.math.utah.edu/pub/tex/bib/tkdd.bib", acknowledgement = ack-nhfb, articleno = "10", fjournal = "ACM Transactions on Knowledge Discovery from Data (TKDD)", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J1054", } @Article{Deodhar:2010:SFS, author = "Meghana Deodhar and Joydeep Ghosh", title = "{SCOAL}: {A} framework for simultaneous co-clustering and learning from complex data", journal = j-TKDD, volume = "4", number = "3", pages = "11:1--11:??", month = oct, year = "2010", CODEN = "????", DOI = "http://dx.doi.org/10.1145/1839490.1839492", ISSN = "1556-4681 (print), 1556-472X (electronic)", bibdate = "Mon Mar 28 11:43:57 MDT 2011", bibsource = "http://www.acm.org/pubs/contents/journals/tkdd/; http://www.math.utah.edu/pub/tex/bib/tkdd.bib", acknowledgement = ack-nhfb, articleno = "11", fjournal = "ACM Transactions on Knowledge Discovery from Data (TKDD)", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J1054", } @Article{Chen:2010:BBI, author = "Jinlin Chen and Keli Xiao", title = "{BISC}: {A} bitmap itemset support counting approach for efficient frequent itemset mining", journal = j-TKDD, volume = "4", number = "3", pages = "12:1--12:??", month = oct, year = "2010", CODEN = "????", DOI = "http://dx.doi.org/10.1145/1839490.1839493", ISSN = "1556-4681 (print), 1556-472X (electronic)", bibdate = "Mon Mar 28 11:43:57 MDT 2011", bibsource = "http://www.acm.org/pubs/contents/journals/tkdd/; http://www.math.utah.edu/pub/tex/bib/tkdd.bib", acknowledgement = ack-nhfb, articleno = "12", fjournal = "ACM Transactions on Knowledge Discovery from Data 
(TKDD)", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J1054", } @Article{Becchetti:2010:EAL, author = "Luca Becchetti and Paolo Boldi and Carlos Castillo and Aristides Gionis", title = "Efficient algorithms for large-scale local triangle counting", journal = j-TKDD, volume = "4", number = "3", pages = "13:1--13:??", month = oct, year = "2010", CODEN = "????", DOI = "http://dx.doi.org/10.1145/1839490.1839494", ISSN = "1556-4681 (print), 1556-472X (electronic)", bibdate = "Mon Mar 28 11:43:57 MDT 2011", bibsource = "http://www.acm.org/pubs/contents/journals/tkdd/; http://www.math.utah.edu/pub/tex/bib/tkdd.bib", acknowledgement = ack-nhfb, articleno = "13", fjournal = "ACM Transactions on Knowledge Discovery from Data (TKDD)", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J1054", } @Article{Zhang:2010:MDR, author = "Yin Zhang and Zhi-Hua Zhou", title = "Multilabel dimensionality reduction via dependence maximization", journal = j-TKDD, volume = "4", number = "3", pages = "14:1--14:??", month = oct, year = "2010", CODEN = "????", DOI = "http://dx.doi.org/10.1145/1839490.1839495", ISSN = "1556-4681 (print), 1556-472X (electronic)", bibdate = "Mon Mar 28 11:43:57 MDT 2011", bibsource = "http://www.acm.org/pubs/contents/journals/tkdd/; http://www.math.utah.edu/pub/tex/bib/tkdd.bib", acknowledgement = ack-nhfb, articleno = "14", fjournal = "ACM Transactions on Knowledge Discovery from Data (TKDD)", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J1054", } @Article{Cui:2010:LMN, author = "Ying Cui and Xiaoli Z. Fern and Jennifer G. 
Dy", title = "Learning multiple nonredundant clusterings", journal = j-TKDD, volume = "4", number = "3", pages = "15:1--15:??", month = oct, year = "2010", CODEN = "????", DOI = "http://dx.doi.org/10.1145/1839490.1839496", ISSN = "1556-4681 (print), 1556-472X (electronic)", bibdate = "Mon Mar 28 11:43:57 MDT 2011", bibsource = "http://www.acm.org/pubs/contents/journals/tkdd/; http://www.math.utah.edu/pub/tex/bib/tkdd.bib", acknowledgement = ack-nhfb, articleno = "15", fjournal = "ACM Transactions on Knowledge Discovery from Data (TKDD)", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J1054", } @Article{Wang:2010:TSI, author = "Wei Wang", title = "{TKDD} Special Issue: {SIGKDD 2009}", journal = j-TKDD, volume = "4", number = "4", pages = "16:1--16:??", month = oct, year = "2010", CODEN = "????", DOI = "http://dx.doi.org/10.1145/1857947.1857948", ISSN = "1556-4681 (print), 1556-472X (electronic)", bibdate = "Mon Mar 28 11:43:58 MDT 2011", bibsource = "http://www.acm.org/pubs/contents/journals/tkdd/; http://www.math.utah.edu/pub/tex/bib/tkdd.bib", acknowledgement = ack-nhfb, articleno = "16", fjournal = "ACM Transactions on Knowledge Discovery from Data (TKDD)", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J1054", } @Article{Chen:2010:BTA, author = "Ye Chen and Dmitry Pavlov and John F. 
Canny", title = "Behavioral Targeting: The Art of Scaling Up Simple Algorithms", journal = j-TKDD, volume = "4", number = "4", pages = "17:1--17:??", month = oct, year = "2010", CODEN = "????", DOI = "http://dx.doi.org/10.1145/1857947.1857949", ISSN = "1556-4681 (print), 1556-472X (electronic)", bibdate = "Mon Mar 28 11:43:58 MDT 2011", bibsource = "http://www.acm.org/pubs/contents/journals/tkdd/; http://www.math.utah.edu/pub/tex/bib/tkdd.bib", acknowledgement = ack-nhfb, articleno = "17", fjournal = "ACM Transactions on Knowledge Discovery from Data (TKDD)", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J1054", } @Article{Mohammed:2010:CDA, author = "Noman Mohammed and Benjamin C. M. Fung and Patrick C. K. Hung and Cheuk-Kwong Lee", title = "Centralized and Distributed Anonymization for High-Dimensional Healthcare Data", journal = j-TKDD, volume = "4", number = "4", pages = "18:1--18:??", month = oct, year = "2010", CODEN = "????", DOI = "http://dx.doi.org/10.1145/1857947.1857950", ISSN = "1556-4681 (print), 1556-472X (electronic)", bibdate = "Mon Mar 28 11:43:58 MDT 2011", bibsource = "http://www.acm.org/pubs/contents/journals/tkdd/; http://www.math.utah.edu/pub/tex/bib/tkdd.bib", acknowledgement = ack-nhfb, articleno = "18", fjournal = "ACM Transactions on Knowledge Discovery from Data (TKDD)", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J1054", } @Article{Liu:2010:BBM, author = "Chao Liu and Fan Guo and Christos Faloutsos", title = "{Bayesian} Browsing Model: Exact Inference of Document Relevance from Petabyte-Scale Data", journal = j-TKDD, volume = "4", number = "4", pages = "19:1--19:??", month = oct, year = "2010", CODEN = "????", DOI = "http://dx.doi.org/10.1145/1857947.1857951", ISSN = "1556-4681 (print), 1556-472X (electronic)", bibdate = "Mon Mar 28 11:43:58 MDT 2011", bibsource = "http://www.acm.org/pubs/contents/journals/tkdd/; http://www.math.utah.edu/pub/tex/bib/tkdd.bib", acknowledgement = ack-nhfb, articleno = "19", fjournal = 
"ACM Transactions on Knowledge Discovery from Data (TKDD)", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J1054", } @Article{Wu:2010:MAF, author = "Mingxi Wu and Chris Jermaine and Sanjay Ranka and Xiuyao Song and John Gums", title = "A Model-Agnostic Framework for Fast Spatial Anomaly Detection", journal = j-TKDD, volume = "4", number = "4", pages = "20:1--20:??", month = oct, year = "2010", CODEN = "????", DOI = "http://dx.doi.org/10.1145/1857947.1857952", ISSN = "1556-4681 (print), 1556-472X (electronic)", bibdate = "Mon Mar 28 11:43:58 MDT 2011", bibsource = "http://www.acm.org/pubs/contents/journals/tkdd/; http://www.math.utah.edu/pub/tex/bib/tkdd.bib", acknowledgement = ack-nhfb, articleno = "20", fjournal = "ACM Transactions on Knowledge Discovery from Data (TKDD)", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J1054", } @Article{Zhong:2010:ATS, author = "Ning Zhong and Gregory Piatetsky-Shapiro and Yiyu Yao and Philip S. Yu", title = "{ACM TKDD} Special Issue on Knowledge Discovery for {Web} Intelligence", journal = j-TKDD, volume = "5", number = "1", pages = "1:1--1:??", month = dec, year = "2010", CODEN = "????", DOI = "http://dx.doi.org/10.1145/1870096.1870097", ISSN = "1556-4681 (print), 1556-472X (electronic)", bibdate = "Mon Mar 28 11:43:59 MDT 2011", bibsource = "http://www.acm.org/pubs/contents/journals/tkdd/; http://www.math.utah.edu/pub/tex/bib/tkdd.bib", acknowledgement = ack-nhfb, articleno = "1", fjournal = "ACM Transactions on Knowledge Discovery from Data (TKDD)", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J1054", } @Article{Tang:2010:CAW, author = "Jie Tang and Limin Yao and Duo Zhang and Jing Zhang", title = "A Combination Approach to {Web} User Profiling", journal = j-TKDD, volume = "5", number = "1", pages = "2:1--2:??", month = dec, year = "2010", CODEN = "????", DOI = "http://dx.doi.org/10.1145/1870096.1870098", ISSN = "1556-4681 (print), 1556-472X (electronic)", bibdate = "Mon Mar 28 11:43:59 MDT 2011", 
bibsource = "http://www.acm.org/pubs/contents/journals/tkdd/; http://www.math.utah.edu/pub/tex/bib/tkdd.bib", acknowledgement = ack-nhfb, articleno = "2", fjournal = "ACM Transactions on Knowledge Discovery from Data (TKDD)", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J1054", } @Article{Bouguessa:2010:DKS, author = "Mohamed Bouguessa and Shengrui Wang and Benoit Dumoulin", title = "Discovering Knowledge-Sharing Communities in Question-Answering Forums", journal = j-TKDD, volume = "5", number = "1", pages = "3:1--3:??", month = dec, year = "2010", CODEN = "????", DOI = "http://dx.doi.org/10.1145/1870096.1870099", ISSN = "1556-4681 (print), 1556-472X (electronic)", bibdate = "Mon Mar 28 11:43:59 MDT 2011", bibsource = "http://www.acm.org/pubs/contents/journals/tkdd/; http://www.math.utah.edu/pub/tex/bib/tkdd.bib", acknowledgement = ack-nhfb, articleno = "3", fjournal = "ACM Transactions on Knowledge Discovery from Data (TKDD)", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J1054", } @Article{Plangprasopchok:2010:MSA, author = "Anon Plangprasopchok and Kristina Lerman", title = "Modeling Social Annotation: {A} {Bayesian} Approach", journal = j-TKDD, volume = "5", number = "1", pages = "4:1--4:??", month = dec, year = "2010", CODEN = "????", DOI = "http://dx.doi.org/10.1145/1870096.1870100", ISSN = "1556-4681 (print), 1556-472X (electronic)", bibdate = "Mon Mar 28 11:43:59 MDT 2011", bibsource = "http://www.acm.org/pubs/contents/journals/tkdd/; http://www.math.utah.edu/pub/tex/bib/tkdd.bib", acknowledgement = ack-nhfb, articleno = "4", fjournal = "ACM Transactions on Knowledge Discovery from Data (TKDD)", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J1054", } @Article{Sakurai:2010:FDG, author = "Yasushi Sakurai and Christos Faloutsos and Spiros Papadimitriou", title = "Fast Discovery of Group Lag Correlations in Streams", journal = j-TKDD, volume = "5", number = "1", pages = "5:1--5:??", month = dec, year = "2010", CODEN = "????", DOI = 
"http://dx.doi.org/10.1145/1870096.1870101", ISSN = "1556-4681 (print), 1556-472X (electronic)", bibdate = "Mon Mar 28 11:43:59 MDT 2011", bibsource = "http://www.acm.org/pubs/contents/journals/tkdd/; http://www.math.utah.edu/pub/tex/bib/tkdd.bib", acknowledgement = ack-nhfb, articleno = "5", fjournal = "ACM Transactions on Knowledge Discovery from Data (TKDD)", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J1054", } @Article{Liu:2010:FCP, author = "Kun Liu and Evimaria Terzi", title = "A Framework for Computing the Privacy Scores of Users in Online Social Networks", journal = j-TKDD, volume = "5", number = "1", pages = "6:1--6:??", month = dec, year = "2010", CODEN = "????", DOI = "http://dx.doi.org/10.1145/1870096.1870102", ISSN = "1556-4681 (print), 1556-472X (electronic)", bibdate = "Mon Mar 28 11:43:59 MDT 2011", bibsource = "http://www.acm.org/pubs/contents/journals/tkdd/; http://www.math.utah.edu/pub/tex/bib/tkdd.bib", acknowledgement = ack-nhfb, articleno = "6", fjournal = "ACM Transactions on Knowledge Discovery from Data (TKDD)", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J1054", } @Article{Sun:2011:ISI, author = "Jimeng Sun and Yan Liu and Jie Tang and Chid Apte", title = "Introduction to Special Issue on Large-Scale Data Mining", journal = j-TKDD, volume = "5", number = "2", pages = "7:1--7:??", month = feb, year = "2011", CODEN = "????", DOI = "http://dx.doi.org/10.1145/1921632.1921633", ISSN = "1556-4681 (print), 1556-472X (electronic)", bibdate = "Mon Mar 28 11:44:01 MDT 2011", bibsource = "http://www.acm.org/pubs/contents/journals/tkdd/; http://www.math.utah.edu/pub/tex/bib/tkdd.bib", acknowledgement = ack-nhfb, articleno = "7", fjournal = "ACM Transactions on Knowledge Discovery from Data (TKDD)", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J1054", } @Article{Kang:2011:HMR, author = "U. Kang and Charalampos E. 
Tsourakakis and Ana Paula Appel and Christos Faloutsos and Jure Leskovec", title = "{HADI}: Mining Radii of Large Graphs", journal = j-TKDD, volume = "5", number = "2", pages = "8:1--8:??", month = feb, year = "2011", CODEN = "????", DOI = "http://dx.doi.org/10.1145/1921632.1921634", ISSN = "1556-4681 (print), 1556-472X (electronic)", bibdate = "Mon Mar 28 11:44:01 MDT 2011", bibsource = "http://www.acm.org/pubs/contents/journals/tkdd/; http://www.math.utah.edu/pub/tex/bib/tkdd.bib", acknowledgement = ack-nhfb, articleno = "8", fjournal = "ACM Transactions on Knowledge Discovery from Data (TKDD)", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J1054", } @Article{deVries:2011:RRL, author = "Timothy de Vries and Hui Ke and Sanjay Chawla and Peter Christen", title = "Robust Record Linkage Blocking Using Suffix Arrays and {Bloom} Filters", journal = j-TKDD, volume = "5", number = "2", pages = "9:1--9:??", month = feb, year = "2011", CODEN = "????", DOI = "http://dx.doi.org/10.1145/1921632.1921635", ISSN = "1556-4681 (print), 1556-472X (electronic)", bibdate = "Mon Mar 28 11:44:01 MDT 2011", bibsource = "http://www.acm.org/pubs/contents/journals/tkdd/; http://www.math.utah.edu/pub/tex/bib/tkdd.bib", acknowledgement = ack-nhfb, articleno = "9", fjournal = "ACM Transactions on Knowledge Discovery from Data (TKDD)", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J1054", } @Article{Dunlavy:2011:TLP, author = "Daniel M. Dunlavy and Tamara G. 
Kolda and Evrim Acar", title = "Temporal Link Prediction Using Matrix and Tensor Factorizations", journal = j-TKDD, volume = "5", number = "2", pages = "10:1--10:??", month = feb, year = "2011", CODEN = "????", DOI = "http://dx.doi.org/10.1145/1921632.1921636", ISSN = "1556-4681 (print), 1556-472X (electronic)", bibdate = "Mon Mar 28 11:44:01 MDT 2011", bibsource = "http://www.acm.org/pubs/contents/journals/tkdd/; http://www.math.utah.edu/pub/tex/bib/tkdd.bib", acknowledgement = ack-nhfb, articleno = "10", fjournal = "ACM Transactions on Knowledge Discovery from Data (TKDD)", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J1054", } @Article{Magdalinos:2011:ECQ, author = "Panagis Magdalinos and Christos Doulkeridis and Michalis Vazirgiannis", title = "Enhancing Clustering Quality through Landmark-Based Dimensionality Reduction", journal = j-TKDD, volume = "5", number = "2", pages = "11:1--11:??", month = feb, year = "2011", CODEN = "????", DOI = "http://dx.doi.org/10.1145/1921632.1921637", ISSN = "1556-4681 (print), 1556-472X (electronic)", bibdate = "Mon Mar 28 11:44:01 MDT 2011", bibsource = "http://www.acm.org/pubs/contents/journals/tkdd/; http://www.math.utah.edu/pub/tex/bib/tkdd.bib", acknowledgement = ack-nhfb, articleno = "11", fjournal = "ACM Transactions on Knowledge Discovery from Data (TKDD)", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J1054", } @Article{Cheng:2011:CLA, author = "Hong Cheng and Yang Zhou and Jeffrey Xu Yu", title = "Clustering Large Attributed Graphs: {A} Balance between Structural and Attribute Similarities", journal = j-TKDD, volume = "5", number = "2", pages = "12:1--12:??", month = feb, year = "2011", CODEN = "????", DOI = "http://dx.doi.org/10.1145/1921632.1921638", ISSN = "1556-4681 (print), 1556-472X (electronic)", bibdate = "Mon Mar 28 11:44:01 MDT 2011", bibsource = "http://www.acm.org/pubs/contents/journals/tkdd/; http://www.math.utah.edu/pub/tex/bib/tkdd.bib", acknowledgement = ack-nhfb, articleno = "12", 
fjournal = "ACM Transactions on Knowledge Discovery from Data (TKDD)", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J1054", } @Article{Menon:2011:FAA, author = "Aditya Krishna Menon and Charles Elkan", title = "Fast Algorithms for Approximating the Singular Value Decomposition", journal = j-TKDD, volume = "5", number = "2", pages = "13:1--13:??", month = feb, year = "2011", CODEN = "????", DOI = "http://dx.doi.org/10.1145/1921632.1921639", ISSN = "1556-4681 (print), 1556-472X (electronic)", bibdate = "Mon Mar 28 11:44:01 MDT 2011", bibsource = "http://www.acm.org/pubs/contents/journals/tkdd/; http://www.math.utah.edu/pub/tex/bib/tkdd.bib", abstract = "A low-rank approximation to a matrix $A$ is a matrix with significantly smaller rank than $A$, and which is close to $A$ according to some norm. Many practical applications involving the use of large matrices focus on low-rank approximations. By reducing the rank or dimensionality of the data, we reduce the complexity of analyzing the data. The singular value decomposition is the most popular low-rank matrix approximation. However, due to its expensive computational requirements, it has often been considered intractable for practical applications involving massive data. Recent developments have tried to address this problem, with several methods proposed to approximate the decomposition with better asymptotic runtime. We present an empirical study of these techniques on a variety of dense and sparse datasets. We find that a sampling approach of Drineas, Kannan and Mahoney is often, but not always, the best performing method. This method gives solutions with high accuracy much faster than classical SVD algorithms, on large sparse datasets in particular. Other modern methods, such as a recent algorithm by Rokhlin and Tygert, also offer savings compared to classical SVD algorithms. 
The older sampling methods of Achlioptas and McSherry are shown to sometimes take longer than classical SVD.", acknowledgement = ack-nhfb, articleno = "13", fjournal = "ACM Transactions on Knowledge Discovery from Data (TKDD)", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J1054", } @Article{Wang:2011:IDC, author = "Dingding Wang and Shenghuo Zhu and Tao Li and Yun Chi and Yihong Gong", title = "Integrating Document Clustering and Multidocument Summarization", journal = j-TKDD, volume = "5", number = "3", pages = "14:1--14:??", month = aug, year = "2011", CODEN = "????", DOI = "http://dx.doi.org/10.1145/1993077.1993078", ISSN = "1556-4681 (print), 1556-472X (electronic)", bibdate = "Thu Aug 18 13:28:08 MDT 2011", bibsource = "http://www.acm.org/pubs/contents/journals/tkdd/; http://www.math.utah.edu/pub/tex/bib/tkdd.bib", acknowledgement = ack-nhfb, articleno = "14", fjournal = "ACM Transactions on Knowledge Discovery from Data (TKDD)", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J1054", } @Article{Maier:2011:INS, author = "Marc Maier and Matthew Rattigan and David Jensen", title = "Indexing Network Structure with Shortest-Path Trees", journal = j-TKDD, volume = "5", number = "3", pages = "15:1--15:??", month = aug, year = "2011", CODEN = "????", DOI = "http://dx.doi.org/10.1145/1993077.1993079", ISSN = "1556-4681 (print), 1556-472X (electronic)", bibdate = "Thu Aug 18 13:28:08 MDT 2011", bibsource = "http://www.acm.org/pubs/contents/journals/tkdd/; http://www.math.utah.edu/pub/tex/bib/tkdd.bib", acknowledgement = ack-nhfb, articleno = "15", fjournal = "ACM Transactions on Knowledge Discovery from Data (TKDD)", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J1054", } @Article{Wong:2011:CUA, author = "Raymond Chi-Wing Wong and Ada Wai-Chee Fu and Ke Wang and Philip S. 
Yu and Jian Pei", title = "Can the Utility of Anonymized Data be Used for Privacy Breaches?", journal = j-TKDD, volume = "5", number = "3", pages = "16:1--16:??", month = aug, year = "2011", CODEN = "????", DOI = "http://dx.doi.org/10.1145/1993077.1993080", ISSN = "1556-4681 (print), 1556-472X (electronic)", bibdate = "Thu Aug 18 13:28:08 MDT 2011", bibsource = "http://www.acm.org/pubs/contents/journals/tkdd/; http://www.math.utah.edu/pub/tex/bib/tkdd.bib", acknowledgement = ack-nhfb, articleno = "16", fjournal = "ACM Transactions on Knowledge Discovery from Data (TKDD)", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J1054", } @Article{Lin:2011:CDM, author = "Yu-Ru Lin and Jimeng Sun and Hari Sundaram and Aisling Kelliher and Paul Castro and Ravi Konuru", title = "Community Discovery via Metagraph Factorization", journal = j-TKDD, volume = "5", number = "3", pages = "17:1--17:??", month = aug, year = "2011", CODEN = "????", DOI = "http://dx.doi.org/10.1145/1993077.1993081", ISSN = "1556-4681 (print), 1556-472X (electronic)", bibdate = "Thu Aug 18 13:28:08 MDT 2011", bibsource = "http://www.acm.org/pubs/contents/journals/tkdd/; http://www.math.utah.edu/pub/tex/bib/tkdd.bib", acknowledgement = ack-nhfb, articleno = "17", fjournal = "ACM Transactions on Knowledge Discovery from Data (TKDD)", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J1054", } @Article{Elkan:2012:GES, author = "Charles Elkan and Yehuda Koren", title = "Guest Editorial for Special Issue {KDD'10}", journal = j-TKDD, volume = "5", number = "4", pages = "18:1--18:??", month = feb, year = "2012", CODEN = "????", DOI = "http://dx.doi.org/10.1145/2086737.2086738", ISSN = "1556-4681 (print), 1556-472X (electronic)", ISSN-L = "1556-4681", bibdate = "Fri Mar 16 15:19:57 MDT 2012", bibsource = "http://www.acm.org/pubs/contents/journals/tkdd/; http://www.math.utah.edu/pub/tex/bib/tkdd.bib", acknowledgement = ack-nhfb, articleno = "18", fjournal = "ACM Transactions on Knowledge Discovery from 
Data (TKDD)", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J1054", } @Article{Iwata:2012:SMT, author = "Tomoharu Iwata and Takeshi Yamada and Yasushi Sakurai and Naonori Ueda", title = "Sequential Modeling of Topic Dynamics with Multiple Timescales", journal = j-TKDD, volume = "5", number = "4", pages = "19:1--19:??", month = feb, year = "2012", CODEN = "????", DOI = "http://dx.doi.org/10.1145/2086737.2086739", ISSN = "1556-4681 (print), 1556-472X (electronic)", ISSN-L = "1556-4681", bibdate = "Fri Mar 16 15:19:57 MDT 2012", bibsource = "http://www.acm.org/pubs/contents/journals/tkdd/; http://www.math.utah.edu/pub/tex/bib/tkdd.bib", abstract = "We propose an online topic model for sequentially analyzing the time evolution of topics in document collections. Topics naturally evolve with multiple timescales. For example, some words may be used consistently over one hundred years, while other words emerge and disappear over periods of a few days. Thus, in the proposed model, current topic-specific distributions over words are assumed to be generated based on the multiscale word distributions of the previous epoch. Considering both the long- and short-timescale dependency yields a more robust model. We derive efficient online inference procedures based on a stochastic EM algorithm, in which the model is sequentially updated using newly obtained data; this means that past data are not required to make the inference. We demonstrate the effectiveness of the proposed method in terms of predictive performance and computational efficiency by examining collections of real documents with timestamps.", acknowledgement = ack-nhfb, articleno = "19", fjournal = "ACM Transactions on Knowledge Discovery from Data (TKDD)", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J1054", } @Article{Huh:2012:DTM, author = "Seungil Huh and Stephen E. 
Fienberg",
  title =        "Discriminative Topic Modeling Based on Manifold Learning",
  journal =      j-TKDD,
  volume =       "5",
  number =       "4",
  pages =        "20:1--20:??",
  month =        feb,
  year =         "2012",
  CODEN =        "????",
  DOI =          "http://dx.doi.org/10.1145/2086737.2086740",
  ISSN =         "1556-4681 (print), 1556-472X (electronic)",
  ISSN-L =       "1556-4681",
  bibdate =      "Fri Mar 16 15:19:57 MDT 2012",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tkdd/; http://www.math.utah.edu/pub/tex/bib/tkdd.bib",
  abstract =     "Topic modeling has become a popular method used for data analysis in various domains including text documents. Previous topic model approaches, such as probabilistic Latent Semantic Analysis (pLSA) and Latent Dirichlet Allocation (LDA), have shown impressive success in discovering low-rank hidden structures for modeling text documents. These approaches, however do not take into account the manifold structure of the data, which is generally informative for nonlinear dimensionality reduction mapping. More recent topic model approaches, Laplacian PLSI (LapPLSI) and Locally-consistent Topic Model (LTM), have incorporated the local manifold structure into topic models and have shown resulting benefits. But they fall short of achieving full discriminating power of manifold learning as they only enhance the proximity between the low-rank representations of neighboring pairs without any consideration for non-neighboring pairs. In this article, we propose a new approach, Discriminative Topic Model (DTM), which separates non-neighboring pairs from each other in addition to bringing neighboring pairs closer together, thereby preserving the global manifold structure as well as improving local consistency. We also present a novel model-fitting algorithm based on the generalized EM algorithm and the concept of Pareto improvement. We empirically demonstrate the success of DTM in terms of unsupervised clustering and semisupervised classification accuracies on text corpora and robustness to parameters compared to state-of-the-art techniques.",
  acknowledgement = ack-nhfb,
  articleno =    "20",
  fjournal =     "ACM Transactions on Knowledge Discovery from Data (TKDD)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1054",
}

@Article{Gomez-Rodriguez:2012:IND,
  author =       "Manuel Gomez-Rodriguez and Jure Leskovec and Andreas Krause",
  title =        "Inferring Networks of Diffusion and Influence",
  journal =      j-TKDD,
  volume =       "5",
  number =       "4",
  pages =        "21:1--21:??",
  month =        feb,
  year =         "2012",
  CODEN =        "????",
  DOI =          "http://dx.doi.org/10.1145/2086737.2086741",
  ISSN =         "1556-4681 (print), 1556-472X (electronic)",
  ISSN-L =       "1556-4681",
  bibdate =      "Fri Mar 16 15:19:57 MDT 2012",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tkdd/; http://www.math.utah.edu/pub/tex/bib/tkdd.bib",
  abstract =     "Information diffusion and virus propagation are fundamental processes taking place in networks. While it is often possible to directly observe when nodes become infected with a virus or publish the information, observing individual transmissions (who infects whom, or who influences whom) is typically very difficult. Furthermore, in many applications, the underlying network over which the diffusions and propagations spread is actually unobserved. We tackle these challenges by developing a method for tracing paths of diffusion and influence through networks and inferring the networks over which contagions propagate. Given the times when nodes adopt pieces of information or become infected, we identify the optimal network that best explains the observed infection times. Since the optimization problem is NP-hard to solve exactly, we develop an efficient approximation algorithm that scales to large datasets and finds provably near-optimal networks. We demonstrate the effectiveness of our approach by tracing information diffusion in a set of 170 million blogs and news articles over a one year period to infer how information flows through the online media space. We find that the diffusion network of news for the top 1,000 media sites and blogs tends to have a core-periphery structure with a small set of core media sites that diffuse information to the rest of the Web. These sites tend to have stable circles of influence with more general news media sites acting as connectors between them.",
  acknowledgement = ack-nhfb,
  articleno =    "21",
  fjournal =     "ACM Transactions on Knowledge Discovery from Data (TKDD)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1054",
}

@Article{Chen:2012:LIS,
  author =       "Jianhui Chen and Ji Liu and Jieping Ye",
  title =        "Learning Incoherent Sparse and Low-Rank Patterns from Multiple Tasks",
  journal =      j-TKDD,
  volume =       "5",
  number =       "4",
  pages =        "22:1--22:??",
  month =        feb,
  year =         "2012",
  CODEN =        "????",
  DOI =          "http://dx.doi.org/10.1145/2086737.2086742",
  ISSN =         "1556-4681 (print), 1556-472X (electronic)",
  ISSN-L =       "1556-4681",
  bibdate =      "Fri Mar 16 15:19:57 MDT 2012",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tkdd/; http://www.math.utah.edu/pub/tex/bib/tkdd.bib",
  abstract =     "We consider the problem of learning incoherent sparse and low-rank patterns from multiple tasks. Our approach is based on a linear multitask learning formulation, in which the sparse and low-rank patterns are induced by a cardinality regularization term and a low-rank constraint, respectively. This formulation is nonconvex; we convert it into its convex surrogate, which can be routinely solved via semidefinite programming for small-size problems. We propose employing the general projected gradient scheme to efficiently solve such a convex surrogate; however, in the optimization formulation, the objective function is nondifferentiable and the feasible domain is nontrivial. We present the procedures for computing the projected gradient and ensuring the global convergence of the projected gradient scheme. The computation of the projected gradient involves a constrained optimization problem; we show that the optimal solution to such a problem can be obtained via solving an unconstrained optimization subproblem and a Euclidean projection subproblem. We also present two projected gradient algorithms and analyze their rates of convergence in detail. In addition, we illustrate the use of the presented projected gradient algorithms for the proposed multitask learning formulation using the least squares loss. Experimental results on a collection of real-world data sets demonstrate the effectiveness of the proposed multitask learning formulation and the efficiency of the proposed projected gradient algorithms.",
  acknowledgement = ack-nhfb,
  articleno =    "22",
  fjournal =     "ACM Transactions on Knowledge Discovery from Data (TKDD)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1054",
}

@Article{Yu:2012:LLC,
  author =       "Hsiang-Fu Yu and Cho-Jui Hsieh and Kai-Wei Chang and Chih-Jen Lin",
  title =        "Large Linear Classification When Data Cannot Fit in Memory",
  journal =      j-TKDD,
  volume =       "5",
  number =       "4",
  pages =        "23:1--23:??",
  month =        feb,
  year =         "2012",
  CODEN =        "????",
  DOI =          "http://dx.doi.org/10.1145/2086737.2086743",
  ISSN =         "1556-4681 (print), 1556-472X (electronic)",
  ISSN-L =       "1556-4681",
  bibdate =      "Fri Mar 16 15:19:57 MDT 2012",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tkdd/; http://www.math.utah.edu/pub/tex/bib/tkdd.bib",
  abstract =     "Recent advances in linear classification have shown that for applications such as document classification, the training process can be extremely efficient. However, most of the existing training methods are designed by assuming that data can be stored in the computer memory. These methods cannot be easily applied to data larger than the memory capacity due to the random access to the disk. We propose and analyze a block minimization framework for data larger than the memory size. At each step a block of data is loaded from the disk and handled by certain learning methods. We investigate two implementations of the proposed framework for primal and dual SVMs, respectively. Because data cannot fit in memory, many design considerations are very different from those for traditional algorithms. We discuss and compare with existing approaches that are able to handle data larger than memory. Experiments using data sets 20 times larger than the memory demonstrate the effectiveness of the proposed method.",
  acknowledgement = ack-nhfb,
  articleno =    "23",
  fjournal =     "ACM Transactions on Knowledge Discovery from Data (TKDD)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1054",
}

@Article{Shahaf:2012:CTL,
  author =       "Dafna Shahaf and Carlos Guestrin",
  title =        "Connecting Two (or Less) Dots: Discovering Structure in News Articles",
  journal =      j-TKDD,
  volume =       "5",
  number =       "4",
  pages =        "24:1--24:??",
  month =        feb,
  year =         "2012",
  CODEN =        "????",
  DOI =          "http://dx.doi.org/10.1145/2086737.2086744",
  ISSN =         "1556-4681 (print), 1556-472X (electronic)",
  ISSN-L =       "1556-4681",
  bibdate =      "Fri Mar 16 15:19:57 MDT 2012",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tkdd/; http://www.math.utah.edu/pub/tex/bib/tkdd.bib",
  abstract =     "Finding information is becoming a major part of our daily life. Entire sectors, from Web users to scientists and intelligence analysts, are increasingly struggling to keep up with the larger and larger amounts of content published every day. With this much data, it is often easy to miss the big picture. In this article, we investigate methods for automatically connecting the dots---providing a structured, easy way to navigate within a new topic and discover hidden connections. We focus on the news domain: given two news articles, our system automatically finds a coherent chain linking them together. For example, it can recover the chain of events starting with the decline of home prices (January 2007), and ending with the health care debate (2009). We formalize the characteristics of a good chain and provide a fast search-driven algorithm to connect two fixed endpoints. We incorporate user feedback into our framework, allowing the stories to be refined and personalized. We also provide a method to handle partially-specified endpoints, for users who do not know both ends of a story. Finally, we evaluate our algorithm over real news data. Our user studies demonstrate that the objective we propose captures the users' intuitive notion of coherence, and that our algorithm effectively helps users understand the news.",
  acknowledgement = ack-nhfb,
  articleno =    "24",
  fjournal =     "ACM Transactions on Knowledge Discovery from Data (TKDD)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1054",
}

@Article{Ienco:2012:CDL,
  author =       "Dino Ienco and Ruggero G. Pensa and Rosa Meo",
  title =        "From Context to Distance: Learning Dissimilarity for Categorical Data Clustering",
  journal =      j-TKDD,
  volume =       "6",
  number =       "1",
  pages =        "1:1--1:??",
  month =        mar,
  year =         "2012",
  CODEN =        "????",
  DOI =          "http://dx.doi.org/10.1145/2133360.2133361",
  ISSN =         "1556-4681 (print), 1556-472X (electronic)",
  ISSN-L =       "1556-4681",
  bibdate =      "Tue Nov 6 18:30:38 MST 2012",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tkdd/; http://www.math.utah.edu/pub/tex/bib/tkdd.bib",
  abstract =     "Clustering data described by categorical attributes is a challenging task in data mining applications. Unlike numerical attributes, it is difficult to define a distance between pairs of values of a categorical attribute, since the values are not ordered. In this article, we propose a framework to learn a context-based distance for categorical attributes. The key intuition of this work is that the distance between two values of a categorical attribute A$_i$ can be determined by the way in which the values of the other attributes A$_j$ are distributed in the dataset objects: if they are similarly distributed in the groups of objects in correspondence of the distinct values of A$_i$ a low value of distance is obtained. We propose also a solution to the critical point of the choice of the attributes A$_j$. We validate our approach by embedding our distance learning framework in a hierarchical clustering algorithm. We applied it on various real world and synthetic datasets, both low and high-dimensional. Experimental results show that our method is competitive with respect to the state of the art of categorical data clustering approaches. We also show that our approach is scalable and has a low impact on the overall computational time of a clustering task.",
  acknowledgement = ack-nhfb,
  articleno =    "1",
  fjournal =     "ACM Transactions on Knowledge Discovery from Data (TKDD)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1054",
}

@Article{Li:2012:EMG,
  author =       "Chun Li and Qingyan Yang and Jianyong Wang and Ming Li",
  title =        "Efficient Mining of Gap-Constrained Subsequences and Its Various Applications",
  journal =      j-TKDD,
  volume =       "6",
  number =       "1",
  pages =        "2:1--2:??",
  month =        mar,
  year =         "2012",
  CODEN =        "????",
  DOI =          "http://dx.doi.org/10.1145/2133360.2133362",
  ISSN =         "1556-4681 (print), 1556-472X (electronic)",
  ISSN-L =       "1556-4681",
  bibdate =      "Tue Nov 6 18:30:38 MST 2012",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tkdd/; http://www.math.utah.edu/pub/tex/bib/tkdd.bib",
  abstract =     "Mining frequent subsequence patterns is a typical data-mining problem and various efficient sequential pattern mining algorithms have been proposed. In many application domains (e.g., biology), the frequent subsequences confined by the predefined gap requirements are more meaningful than the general sequential patterns. In this article, we propose two algorithms, Gap-BIDE for mining closed gap-constrained subsequences from a set of input sequences, and Gap-Connect for mining repetitive gap-constrained subsequences from a single input sequence. Inspired by some state-of-the-art closed or constrained sequential pattern mining algorithms, the Gap-BIDE algorithm adopts an efficient approach to finding the complete set of closed sequential patterns with gap constraints, while the Gap-Connect algorithm efficiently mines an approximate set of long patterns by connecting short patterns. We also present several methods for feature selection from the set of gap-constrained patterns for the purpose of classification and clustering. Our extensive performance study shows that our approaches are very efficient in mining frequent subsequences with gap constraints, and the gap-constrained pattern based classification/clustering approaches can achieve high-quality results.",
  acknowledgement = ack-nhfb,
  articleno =    "2",
  fjournal =     "ACM Transactions on Knowledge Discovery from Data (TKDD)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1054",
}

@Article{Liu:2012:IBA,
  author =       "Fei Tony Liu and Kai Ming Ting and Zhi-Hua Zhou",
  title =        "Isolation-Based Anomaly Detection",
  journal =      j-TKDD,
  volume =       "6",
  number =       "1",
  pages =        "3:1--3:??",
  month =        mar,
  year =         "2012",
  CODEN =        "????",
  DOI =          "http://dx.doi.org/10.1145/2133360.2133363",
  ISSN =         "1556-4681 (print), 1556-472X (electronic)",
  ISSN-L =       "1556-4681",
  bibdate =      "Tue Nov 6 18:30:38 MST 2012",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tkdd/; http://www.math.utah.edu/pub/tex/bib/tkdd.bib",
  abstract =     "Anomalies are data points that are few and different. As a result of these properties, we show that, anomalies are susceptible to a mechanism called isolation. This article proposes a method called Isolation Forest ($i$ Forest), which detects anomalies purely based on the concept of isolation without employing any distance or density measure---fundamentally different from all existing methods. As a result, $i$ Forest is able to exploit subsampling (i) to achieve a low linear time-complexity and a small memory-requirement and (ii) to deal with the effects of swamping and masking effectively. Our empirical evaluation shows that $i$ Forest outperforms ORCA, one-class SVM, LOF and Random Forests in terms of AUC, processing time, and it is robust against masking and swamping effects. $i$ Forest also works well in high dimensional problems containing a large number of irrelevant attributes, and when anomalies are not available in training sample.",
  acknowledgement = ack-nhfb,
  articleno =    "3",
  fjournal =     "ACM Transactions on Knowledge Discovery from Data (TKDD)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1054",
}

@Article{Jin:2012:MML,
  author =       "Yu Jin and Nick Duffield and Jeffrey Erman and Patrick Haffner and Subhabrata Sen and Zhi-Li Zhang",
  title =        "A Modular Machine Learning System for Flow-Level Traffic Classification in Large Networks",
  journal =      j-TKDD,
  volume =       "6",
  number =       "1",
  pages =        "4:1--4:??",
  month =        mar,
  year =         "2012",
  CODEN =        "????",
  DOI =          "http://dx.doi.org/10.1145/2133360.2133364",
  ISSN =         "1556-4681 (print), 1556-472X (electronic)",
  ISSN-L =       "1556-4681",
  bibdate =      "Tue Nov 6 18:30:38 MST 2012",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tkdd/; http://www.math.utah.edu/pub/tex/bib/tkdd.bib",
  abstract =     "The ability to accurately and scalably classify network traffic is of critical importance to a wide range of management tasks of large networks, such as tier-1 ISP networks and global enterprise networks. Guided by the practical constraints and requirements of traffic classification in large networks, in this article, we explore the design of an accurate and scalable machine learning based flow-level traffic classification system, which is trained on a dataset of flow-level data that has been annotated with application protocol labels by a packet-level classifier. Our system employs a lightweight modular architecture, which combines a series of simple linear binary classifiers, each of which can be efficiently implemented and trained on vast amounts of flow data in parallel, and embraces three key innovative mechanisms, weighted threshold sampling, logistic calibration, and intelligent data partitioning, to achieve scalability while attaining high accuracy. Evaluations using real traffic data from multiple locations in a large ISP show that our system accurately reproduces the labels of the packet level classifier when runs on (unlabeled) flow records, while meeting the scalability and stability requirements of large ISP networks. Using training and test datasets that are two months apart and collected from two different locations, the flow error rates are only 3\% for TCP flows and 0.4\% for UDP flows. We further show that such error rates can be reduced by combining the information of spatial distributions of flows, or collective traffic statistics, during classification. We propose a novel two-step model, which seamlessly integrates these collective traffic statistics into the existing traffic classification system. Experimental results display performance improvement on all traffic classes and an overall error rate reduction by 15\%. In addition to a high accuracy, at runtime, our implementation easily scales to classify traffic on 10Gbps links.",
  acknowledgement = ack-nhfb,
  articleno =    "4",
  fjournal =     "ACM Transactions on Knowledge Discovery from Data (TKDD)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1054",
}

@Article{Mavroeidis:2012:SSF,
  author =       "Dimitrios Mavroeidis and Panagis Magdalinos",
  title =        "A Sequential Sampling Framework for Spectral $k$-Means Based on Efficient Bootstrap Accuracy Estimations: Application to Distributed Clustering",
  journal =      j-TKDD,
  volume =       "6",
  number =       "2",
  pages =        "5:1--5:??",
  month =        jul,
  year =         "2012",
  CODEN =        "????",
  DOI =          "http://dx.doi.org/10.1145/2297456.2297457",
  ISSN =         "1556-4681 (print), 1556-472X (electronic)",
  ISSN-L =       "1556-4681",
  bibdate =      "Tue Nov 6 18:30:38 MST 2012",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tkdd/; http://www.math.utah.edu/pub/tex/bib/tkdd.bib",
  abstract =     "The scalability of learning algorithms has always been a central concern for data mining researchers, and nowadays, with the rapid increase in data storage capacities and availability, its importance has increased. To this end, sampling has been studied by several researchers in an effort to derive sufficiently accurate models using only small data fractions. In this article we focus on spectral $k$-means, that is, the $k$-means approximation as derived by the spectral relaxation, and propose a sequential sampling framework that iteratively enlarges the sample size until the $k$-means results (objective function and cluster structure) become indistinguishable from the asymptotic (infinite-data) output. In the proposed framework we adopt a commonly applied principle in data mining research that considers the use of minimal assumptions concerning the data generating distribution. This restriction imposes several challenges, mainly related to the efficiency of the sequential sampling procedure. These challenges are addressed using elements of matrix perturbation theory and statistics. Moreover, although the main focus is on spectral $k$-means, we also demonstrate that the proposed framework can be generalized to handle spectral clustering. The proposed sequential sampling framework is consecutively employed for addressing the distributed clustering problem, where the task is to construct a global model for data that resides in distributed network nodes. The main challenge in this context is related to the bandwidth constraints that are commonly imposed, thus requiring that the distributed clustering algorithm consumes a minimal amount of network load. This illustrates the applicability of the proposed approach, as it enables the determination of a minimal sample size that can be used for constructing an accurate clustering model that entails the distributional characteristics of the data. As opposed to the relevant distributed $k$-means approaches, our framework takes into account the fact that the choice of the number of clusters has a crucial effect on the required amount of communication. More precisely, the proposed algorithm is able to derive a statistical estimation of the required relative sizes for all possible values of $k$. This unique feature of our distributed clustering framework enables a network administrator to choose an economic solution that identifies the crude cluster structure of a dataset and not devote excessive network resources for identifying all the ``correct'' detailed clusters.",
  acknowledgement = ack-nhfb,
  articleno =    "5",
  fjournal =     "ACM Transactions on Knowledge Discovery from Data (TKDD)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1054",
}

@Article{Das:2012:MIG,
  author =       "Sanmay Das and Malik Magdon-Ismail",
  title =        "A Model for Information Growth in Collective Wisdom Processes",
  journal =      j-TKDD,
  volume =       "6",
  number =       "2",
  pages =        "6:1--6:??",
  month =        jul,
  year =         "2012",
  CODEN =        "????",
  DOI =          "http://dx.doi.org/10.1145/2297456.2297458",
  ISSN =         "1556-4681 (print), 1556-472X (electronic)",
  ISSN-L =       "1556-4681",
  bibdate =      "Tue Nov 6 18:30:38 MST 2012",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tkdd/; http://www.math.utah.edu/pub/tex/bib/tkdd.bib",
  abstract =     "Collaborative media such as wikis have become enormously successful venues for information creation. Articles accrue information through the asynchronous editing of users who arrive both seeking information and possibly able to contribute information. Most articles stabilize to high-quality, trusted sources of information representing the collective wisdom of all the users who edited the article. We propose a model for information growth which relies on two main observations: (i) as an article's quality improves, it attracts visitors at a faster rate (a rich-get-richer phenomenon); and, simultaneously, (ii) the chances that a new visitor will improve the article drops (there is only so much that can be said about a particular topic). Our model is able to reproduce many features of the edit dynamics observed on Wikipedia; in particular, it captures the observed rise in the edit rate, followed by $1/ t$ decay. Despite differences in the media, we also document similar features in the comment rates for a segment of the LiveJournal blogosphere.",
  acknowledgement = ack-nhfb,
  articleno =    "6",
  fjournal =     "ACM Transactions on Knowledge Discovery from Data (TKDD)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1054",
}

@Article{Xu:2012:GME,
  author =       "Tianbing Xu and Zhongfei Zhang and Philip S. Yu and Bo Long",
  title =        "Generative Models for Evolutionary Clustering",
  journal =      j-TKDD,
  volume =       "6",
  number =       "2",
  pages =        "7:1--7:??",
  month =        jul,
  year =         "2012",
  CODEN =        "????",
  DOI =          "http://dx.doi.org/10.1145/2297456.2297459",
  ISSN =         "1556-4681 (print), 1556-472X (electronic)",
  ISSN-L =       "1556-4681",
  bibdate =      "Tue Nov 6 18:30:38 MST 2012",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tkdd/; http://www.math.utah.edu/pub/tex/bib/tkdd.bib",
  abstract =     "This article studies evolutionary clustering, a recently emerged hot topic with many important applications, noticeably in dynamic social network analysis. In this article, based on the recent literature on nonparametric Bayesian models, we have developed two generative models: DPChain and HDP-HTM. DPChain is derived from the Dirichlet process mixture (DPM) model, with an exponential decaying component along with the time. HDP-HTM combines the hierarchical dirichlet process (HDP) with a hierarchical transition matrix (HTM) based on the proposed Infinite hierarchical Markov state model (iHMS). Both models substantially advance the literature on evolutionary clustering, in the sense that not only do they both perform better than those in the existing literature, but more importantly, they are capable of automatically learning the cluster numbers and explicitly addressing the corresponding issues. Extensive evaluations have demonstrated the effectiveness and the promise of these two solutions compared to the state-of-the-art literature.",
  acknowledgement = ack-nhfb,
  articleno =    "7",
  fjournal =     "ACM Transactions on Knowledge Discovery from Data (TKDD)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1054",
}

@Article{Wang:2012:LME,
  author =       "Shaojun Wang and Dale Schuurmans and Yunxin Zhao",
  title =        "The Latent Maximum Entropy Principle",
  journal =      j-TKDD,
  volume =       "6",
  number =       "2",
  pages =        "8:1--8:??",
  month =        jul,
  year =         "2012",
  CODEN =        "????",
  DOI =          "http://dx.doi.org/10.1145/2297456.2297460",
  ISSN =         "1556-4681 (print), 1556-472X (electronic)",
  ISSN-L =       "1556-4681",
  bibdate =      "Tue Nov 6 18:30:38 MST 2012",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tkdd/; http://www.math.utah.edu/pub/tex/bib/tkdd.bib",
  abstract =     "We present an extension to Jaynes' maximum entropy principle that incorporates latent variables. The principle of latent maximum entropy we propose is different from both Jaynes' maximum entropy principle and maximum likelihood estimation, but can yield better estimates in the presence of hidden variables and limited training data. We first show that solving for a latent maximum entropy model poses a hard nonlinear constrained optimization problem in general. However, we then show that feasible solutions to this problem can be obtained efficiently for the special case of log-linear models---which forms the basis for an efficient approximation to the latent maximum entropy principle. We derive an algorithm that combines expectation-maximization with iterative scaling to produce feasible log-linear solutions. This algorithm can be interpreted as an alternating minimization algorithm in the information divergence, and reveals an intimate connection between the latent maximum entropy and maximum likelihood principles. To select a final model, we generate a series of feasible candidates, calculate the entropy of each, and choose the model that attains the highest entropy. Our experimental results show that estimation based on the latent maximum entropy principle generally gives better results than maximum likelihood when estimating latent variable models on small observed data samples.",
  acknowledgement = ack-nhfb,
  articleno =    "8",
  fjournal =     "ACM Transactions on Knowledge Discovery from Data (TKDD)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1054",
}

@Article{Bhattacharya:2012:CGC,
  author =       "Indrajit Bhattacharya and Shantanu Godbole and Sachindra Joshi and Ashish Verma",
  title =        "Cross-Guided Clustering: Transfer of Relevant Supervision across Tasks",
  journal =      j-TKDD,
  volume =       "6",
  number =       "2",
  pages =        "9:1--9:??",
  month =        jul,
  year =         "2012",
  CODEN =        "????",
  DOI =          "http://dx.doi.org/10.1145/2297456.2297461",
  ISSN =         "1556-4681 (print), 1556-472X (electronic)",
  ISSN-L =       "1556-4681",
  bibdate =      "Tue Nov 6 18:30:38 MST 2012",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tkdd/; http://www.math.utah.edu/pub/tex/bib/tkdd.bib",
  abstract =     "Lack of supervision in clustering algorithms often leads to clusters that are not useful or interesting to human reviewers. We investigate if supervision can be automatically transferred for clustering a target task, by providing a relevant supervised partitioning of a dataset from a different source task. The target clustering is made more meaningful for the human user by trading-off intrinsic clustering goodness on the target task for alignment with relevant supervised partitions in the source task, wherever possible. We propose a cross-guided clustering algorithm that builds on traditional k-means by aligning the target clusters with source partitions. The alignment process makes use of a cross-task similarity measure that discovers hidden relationships across tasks. When the source and target tasks correspond to different domains with potentially different vocabularies, we propose a projection approach using pivot vocabularies for the cross-domain similarity measure. Using multiple real-world and synthetic datasets, we show that our approach improves clustering accuracy significantly over traditional k-means and state-of-the-art semi-supervised clustering baselines, over a wide range of data characteristics and parameter settings.",
  acknowledgement = ack-nhfb,
  articleno =    "9",
  fjournal =     "ACM Transactions on Knowledge Discovery from Data (TKDD)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1054",
}

@Article{Wang:2012:LBN,
  author =       "Zhenxing Wang and Laiwan Chan",
  title =        "Learning {Bayesian} networks from {Markov} random fields: an efficient algorithm for linear models",
  journal =      j-TKDD,
  volume =       "6",
  number =       "3",
  pages =        "10:1--10:??",
  month =        oct,
  year =         "2012",
  CODEN =        "????",
  DOI =          "http://dx.doi.org/10.1145/2362383.2362384",
  ISSN =         "1556-4681 (print), 1556-472X (electronic)",
  ISSN-L =       "1556-4681",
  bibdate =      "Tue Nov 6 18:30:40 MST 2012",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tkdd/; http://www.math.utah.edu/pub/tex/bib/tkdd.bib",
  abstract =     "Dependency analysis is a typical approach for Bayesian network learning, which infers the structures of Bayesian networks by the results of a series of conditional independence (CI) tests. In practice, testing independence conditioning on large sets hampers the performance of dependency analysis algorithms in terms of accuracy and running time for the following reasons. First, testing independence on large sets of variables with limited samples is not stable. Second, for most dependency analysis algorithms, the number of CI tests grows at an exponential rate with the sizes of conditioning sets, and the running time grows of the same rate. Therefore, determining how to reduce the number of CI tests and the sizes of conditioning sets becomes a critical step in dependency analysis algorithms. In this article, we address a two-phase algorithm based on the observation that the structures of Markov random fields are similar to those of Bayesian networks. The first phase of the algorithm constructs a Markov random field from data, which provides a close approximation to the structure of the true Bayesian network; the second phase of the algorithm removes redundant edges according to CI tests to get the true Bayesian network. Both phases use Markov blanket information to reduce the sizes of conditioning sets and the number of CI tests without sacrificing accuracy. An empirical study shows that the two-phase algorithm performs well in terms of accuracy and efficiency.",
  acknowledgement = ack-nhfb,
  articleno =    "10",
  fjournal =     "ACM Transactions on Knowledge Discovery from Data (TKDD)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1054",
}

@Article{Chan:2012:CID,
  author =       "Jeffrey Chan and James Bailey and Christopher Leckie and Michael Houle",
  title =        "{ciForager}: Incrementally discovering regions of correlated change in evolving graphs",
  journal =      j-TKDD,
  volume =       "6",
  number =       "3",
  pages =        "11:1--11:??",
  month =        oct,
  year =         "2012",
  CODEN =        "????",
  DOI =          "http://dx.doi.org/10.1145/2362383.2362385",
  ISSN =         "1556-4681 (print), 1556-472X (electronic)",
  ISSN-L =       "1556-4681",
  bibdate =      "Tue Nov 6 18:30:40 MST 2012",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tkdd/; http://www.math.utah.edu/pub/tex/bib/tkdd.bib",
  abstract =     "Data mining techniques for understanding how graphs evolve over time have become increasingly important. Evolving graphs arise naturally in diverse applications such as computer network topologies, multiplayer games and medical imaging. A natural and interesting problem in evolving graph analysis is the discovery of compact subgraphs that change in a similar manner. Such subgraphs are known as regions of correlated change and they can both summarise change patterns in graphs and help identify the underlying events causing these changes. However, previous techniques for discovering regions of correlated change suffer from limited scalability, making them unsuitable for analysing the evolution of very large graphs. In this paper, we introduce a new algorithm called ciForager, that addresses this scalability challenge and offers considerable improvements. The efficiency of ciForager is based on the use of new incremental techniques for detecting change, as well as the use of Voronoi representations for efficiently determining distance. We experimentally show that ciForager can achieve speedups of up to 1000 times over previous approaches. As a result, it becomes feasible for the first time to discover regions of correlated change in extremely large graphs, such as the entire BGP routing topology of the Internet.",
  acknowledgement = ack-nhfb,
  articleno =    "11",
  fjournal =     "ACM Transactions on Knowledge Discovery from Data (TKDD)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1054",
}

@Article{Wang:2012:CDS,
  author =       "Dingding Wang and Shenghuo Zhu and Tao Li and Yihong Gong",
  title =        "Comparative document summarization via discriminative sentence selection",
  journal =      j-TKDD,
  volume =       "6",
  number =       "3",
  pages =        "12:1--12:??",
  month =        oct,
  year =         "2012",
  CODEN =        "????",
  DOI =          "http://dx.doi.org/10.1145/2362383.2362386",
  ISSN =         "1556-4681 (print), 1556-472X (electronic)",
  ISSN-L =       "1556-4681",
  bibdate =      "Tue Nov 6 18:30:40 MST 2012",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tkdd/; http://www.math.utah.edu/pub/tex/bib/tkdd.bib",
  abstract =     "Given a collection of document groups, a natural question is to identify the differences among them. Although traditional document summarization techniques can summarize the content of the document groups one by one, there exists a great necessity to generate a summary of the differences among the document groups. In this article, we study a novel problem, that of summarizing the differences between document groups. A discriminative sentence selection method is proposed to extract the most discriminative sentences which represent the specific characteristics of each document group. Experiments and case studies on real-world data sets demonstrate the effectiveness of our proposed method.",
  acknowledgement = ack-nhfb,
  articleno =    "12",
  fjournal =     "ACM Transactions on Knowledge Discovery from Data (TKDD)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1054",
}

@Article{deMelo:2012:FNO,
  author =       "Pedro O. S. {Vaz de Melo} and Virgilio A. F. Almeida and Antonio A. F. Loureiro and Christos Faloutsos",
  title =        "Forecasting in the {NBA} and other team sports: Network effects in action",
  journal =      j-TKDD,
  volume =       "6",
  number =       "3",
  pages =        "13:1--13:??",
  month =        oct,
  year =         "2012",
  CODEN =        "????",
  DOI =          "http://dx.doi.org/10.1145/2362383.2362387",
  ISSN =         "1556-4681 (print), 1556-472X (electronic)",
  ISSN-L =       "1556-4681",
  bibdate =      "Tue Nov 6 18:30:40 MST 2012",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tkdd/; http://www.math.utah.edu/pub/tex/bib/tkdd.bib",
  abstract =     "The multi-million sports-betting market is based on the fact that the task of predicting the outcome of a sports event is very hard. Even with the aid of an uncountable number of descriptive statistics and background information, only a few can correctly guess the outcome of a game or a league. In this work, our approach is to move away from the traditional way of predicting sports events, and instead to model sports leagues as networks of players and teams where the only information available is the work relationships among them. We propose two network-based models to predict the behavior of teams in sports leagues. These models are parameter-free, that is, they do not have a single parameter, and moreover are sport-agnostic: they can be applied directly to any team sports league. First, we view a sports league as a network in evolution, and we infer the implicit feedback behind network changes and properties over the years. Then, we use this knowledge to construct the network-based prediction models, which can, with a significantly high probability, indicate how well a team will perform over a season. We compare our proposed models with other prediction models in two of the most popular sports leagues: the National Basketball Association (NBA) and the Major League Baseball (MLB). Our model shows consistently good results in comparison with the other models and, relying upon the network properties of the teams, we achieved a $\approx 14\%$ rank prediction accuracy improvement over our best competitor.",
  acknowledgement = ack-nhfb,
  articleno =    "13",
  fjournal =     "ACM Transactions on Knowledge Discovery from Data (TKDD)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1054",
}

@Article{Ghosh:2012:SIB,
  author =       "Joydeep Ghosh and Padhraic Smyth and Andrew Tomkins and Rich Caruana",
  title =        "Special issue on best of {SIGKDD 2011}",
  journal =      j-TKDD,
  volume =       "6",
  number =       "4",
  pages =        "14:1--14:??",
  month =        dec,
  year =         "2012",
  CODEN =        "????",
  DOI =          "http://dx.doi.org/10.1145/2382577.2382578",
  ISSN =         "1556-4681 (print), 1556-472X (electronic)",
  ISSN-L =       "1556-4681",
  bibdate =      "Mon Jun 24 13:02:40 MDT 2013",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tkdd/; http://www.math.utah.edu/pub/tex/bib/tkdd.bib",
  acknowledgement = ack-nhfb,
  articleno =    "14",
  fjournal =     "ACM Transactions on Knowledge Discovery from Data (TKDD)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1054",
}

@Article{Kaufman:2012:LDM,
  author =       "Shachar Kaufman and Saharon Rosset and Claudia Perlich and
Ori Stitelman", title = "Leakage in data mining: Formulation, detection, and avoidance", journal = j-TKDD, volume = "6", number = "4", pages = "15:1--15:??", month = dec, year = "2012", CODEN = "????", DOI = "http://dx.doi.org/10.1145/2382577.2382579", ISSN = "1556-4681 (print), 1556-472X (electronic)", ISSN-L = "1556-4681", bibdate = "Mon Jun 24 13:02:40 MDT 2013", bibsource = "http://www.acm.org/pubs/contents/journals/tkdd/; http://www.math.utah.edu/pub/tex/bib/tkdd.bib", abstract = "Deemed ``one of the top ten data mining mistakes'', leakage is the introduction of information about the data mining target that should not be legitimately available to mine from. In addition to our own industry experience with real-life projects, controversies around several major public data mining competitions held recently such as the INFORMS 2010 Data Mining Challenge and the IJCNN 2011 Social Network Challenge are evidence that this issue is as relevant today as it has ever been. While acknowledging the importance and prevalence of leakage in both synthetic competitions and real-life data mining projects, existing literature has largely left this idea unexplored. What little has been said turns out not to be broad enough to cover more complex cases of leakage, such as those where the classical independently and identically distributed (i.i.d.) assumption is violated, that have been recently documented. In our new approach, these cases and others are explained by explicitly defining modeling goals and analyzing the broader framework of the data mining problem. The resulting definition enables us to derive general methodology for dealing with the issue. We show that it is possible to avoid leakage with a simple specific approach to data management followed by what we call a learn-predict separation, and present several ways of detecting leakage when the modeler has no control over how the data have been collected. 
We also offer an alternative point of view on leakage that is based on causal graph modeling concepts.", acknowledgement = ack-nhfb, articleno = "15", fjournal = "ACM Transactions on Knowledge Discovery from Data (TKDD)", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J1054", } @Article{Mampaey:2012:SDS, author = "Michael Mampaey and Jilles Vreeken and Nikolaj Tatti", title = "Summarizing data succinctly with the most informative itemsets", journal = j-TKDD, volume = "6", number = "4", pages = "16:1--16:??", month = dec, year = "2012", CODEN = "????", DOI = "http://dx.doi.org/10.1145/2382577.2382580", ISSN = "1556-4681 (print), 1556-472X (electronic)", ISSN-L = "1556-4681", bibdate = "Mon Jun 24 13:02:40 MDT 2013", bibsource = "http://www.acm.org/pubs/contents/journals/tkdd/; http://www.math.utah.edu/pub/tex/bib/tkdd.bib", abstract = "Knowledge discovery from data is an inherently iterative process. That is, what we know about the data greatly determines our expectations, and therefore, what results we would find interesting and/or surprising. Given new knowledge about the data, our expectations will change. Hence, in order to avoid redundant results, knowledge discovery algorithms ideally should follow such an iterative updating procedure. With this in mind, we introduce a well-founded approach for succinctly summarizing data with the most informative itemsets; using a probabilistic maximum entropy model, we iteratively find the itemset that provides us the most novel information --- that is, for which the frequency in the data surprises us the most --- and in turn we update our model accordingly. As we use the maximum entropy principle to obtain unbiased probabilistic models, and only include those itemsets that are most informative with regard to the current model, the summaries we construct are guaranteed to be both descriptive and nonredundant. 
The algorithm that we present, called mtv, can either discover the top-$k$ most informative itemsets, or we can employ either the Bayesian Information Criterion (bic) or the Minimum Description Length (mdl) principle to automatically identify the set of itemsets that together summarize the data well. In other words, our method will ``tell you what you need to know'' about the data. Importantly, it is a one-phase algorithm: rather than picking itemsets from a user-provided candidate set, itemsets and their supports are mined on-the-fly. To further its applicability, we provide an efficient method to compute the maximum entropy distribution using Quick Inclusion-Exclusion. Experiments on our method, using synthetic, benchmark, and real data, show that the discovered summaries are succinct, and correctly identify the key patterns in the data. The models they form attain high likelihoods, and inspection shows that they summarize the data well with increasingly specific, yet nonredundant itemsets.", acknowledgement = ack-nhfb, articleno = "16", fjournal = "ACM Transactions on Knowledge Discovery from Data (TKDD)", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J1054", } @Article{Chu:2012:TLM, author = "Shumo Chu and James Cheng", title = "Triangle listing in massive networks", journal = j-TKDD, volume = "6", number = "4", pages = "17:1--17:??", month = dec, year = "2012", CODEN = "????", DOI = "http://dx.doi.org/10.1145/2382577.2382581", ISSN = "1556-4681 (print), 1556-472X (electronic)", ISSN-L = "1556-4681", bibdate = "Mon Jun 24 13:02:40 MDT 2013", bibsource = "http://www.acm.org/pubs/contents/journals/tkdd/; http://www.math.utah.edu/pub/tex/bib/tkdd.bib", abstract = "Triangle listing is one of the fundamental algorithmic problems whose solution has numerous applications especially in the analysis of complex networks, such as the computation of clustering coefficients, transitivity, triangular connectivity, trusses, etc. 
Existing algorithms for triangle listing are mainly in-memory algorithms, whose performance cannot scale with the massive volume of today's fast growing networks. When the input graph cannot fit in main memory, triangle listing requires random disk accesses that can incur prohibitively huge I/O cost. Some streaming, semistreaming, and sampling algorithms have been proposed but these are approximation algorithms. We propose an I/O-efficient algorithm for triangle listing. Our algorithm is exact and avoids random disk access. Our results show that our algorithm is scalable and outperforms the state-of-the-art in-memory and local triangle estimation algorithms.", acknowledgement = ack-nhfb, articleno = "17", fjournal = "ACM Transactions on Knowledge Discovery from Data (TKDD)", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J1054", } @Article{Chattopadhyay:2012:MDA, author = "Rita Chattopadhyay and Qian Sun and Wei Fan and Ian Davidson and Sethuraman Panchanathan and Jieping Ye", title = "Multisource domain adaptation and its application to early detection of fatigue", journal = j-TKDD, volume = "6", number = "4", pages = "18:1--18:??", month = dec, year = "2012", CODEN = "????", DOI = "http://dx.doi.org/10.1145/2382577.2382582", ISSN = "1556-4681 (print), 1556-472X (electronic)", ISSN-L = "1556-4681", bibdate = "Mon Jun 24 13:02:40 MDT 2013", bibsource = "http://www.acm.org/pubs/contents/journals/tkdd/; http://www.math.utah.edu/pub/tex/bib/tkdd.bib", abstract = "We consider the characterization of muscle fatigue through a noninvasive sensing mechanism such as Surface ElectroMyoGraphy (SEMG). While changes in the properties of SEMG signals with respect to muscle fatigue have been reported in the literature, the large variation in these signals across different individuals makes the task of modeling and classification of SEMG signals challenging. Indeed, the variation in SEMG parameters from subject to subject creates differences in the data distribution. 
In this article, we propose two transfer learning frameworks based on the multisource domain adaptation methodology for detecting different stages of fatigue using SEMG signals, that addresses the distribution differences. In the proposed frameworks, the SEMG data of a subject represent a domain; data from multiple subjects in the training set form the multiple source domains and the test subject data form the target domain. SEMG signals are predominantly different in conditional probability distribution across subjects. The key feature of the first framework is a novel weighting scheme that addresses the conditional probability distribution differences across multiple domains (subjects) and the key feature of the second framework is a two-stage domain adaptation methodology which combines weighted data from multiple sources based on marginal probability differences (first stage) as well as conditional probability differences (second stage), with the target domain data. The weights for minimizing the marginal probability differences are estimated independently, while the weights for minimizing conditional probability differences are computed simultaneously by exploiting the potential interaction among multiple sources. We also provide a theoretical analysis on the generalization performance of the proposed multisource domain adaptation formulation using the weighted Rademacher complexity measure. We have validated the proposed frameworks on Surface ElectroMyoGram signals collected from 8 people during a fatigue-causing repetitive gripping activity. 
Comprehensive experiments on the SEMG dataset demonstrate that the proposed method improves the classification accuracy by 20\% to 30\% over the cases without any domain adaptation method and by 13\% to 30\% over existing state-of-the-art domain adaptation methods.", acknowledgement = ack-nhfb, articleno = "18", fjournal = "ACM Transactions on Knowledge Discovery from Data (TKDD)", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J1054", } @Article{Wilkinson:2012:SIS, author = "Leland Wilkinson and Anushka Anand and Tuan Nhon Dang", title = "Substantial improvements in the set-covering projection classifier {CHIRP} (composite hypercubes on iterated random projections)", journal = j-TKDD, volume = "6", number = "4", pages = "19:1--19:??", month = dec, year = "2012", CODEN = "????", DOI = "http://dx.doi.org/10.1145/2382577.2382583", ISSN = "1556-4681 (print), 1556-472X (electronic)", ISSN-L = "1556-4681", bibdate = "Mon Jun 24 13:02:40 MDT 2013", bibsource = "http://www.acm.org/pubs/contents/journals/tkdd/; http://www.math.utah.edu/pub/tex/bib/tkdd.bib", abstract = "In Wilkinson et al. [2011] we introduced a new set-covering random projection classifier that achieved average error lower than that of other classifiers in the Weka platform. This classifier was based on an $L^\infty$ norm distance function and exploited an iterative sequence of three stages (projecting, binning, and covering) to deal with the curse of dimensionality, computational complexity, and nonlinear separability. 
We now present substantial changes that improve robustness and reduce training and testing time by almost an order of magnitude without jeopardizing CHIRP's outstanding error performance.", acknowledgement = ack-nhfb, articleno = "19", fjournal = "ACM Transactions on Knowledge Discovery from Data (TKDD)", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J1054", } @Article{Angiulli:2013:NNB, author = "Fabrizio Angiulli and Fabio Fassetti", title = "Nearest Neighbor-Based Classification of Uncertain Data", journal = j-TKDD, volume = "7", number = "1", pages = "1:1--1:??", month = mar, year = "2013", CODEN = "????", DOI = "http://dx.doi.org/10.1145/2435209.2435210", ISSN = "1556-4681 (print), 1556-472X (electronic)", ISSN-L = "1556-4681", bibdate = "Mon Jun 24 13:02:44 MDT 2013", bibsource = "http://www.acm.org/pubs/contents/journals/tkdd/; http://www.math.utah.edu/pub/tex/bib/tkdd.bib", abstract = "This work deals with the problem of classifying uncertain data. With this aim we introduce the Uncertain Nearest Neighbor (UNN) rule, which represents the generalization of the deterministic nearest neighbor rule to the case in which uncertain objects are available. The UNN rule relies on the concept of nearest neighbor class, rather than on that of nearest neighbor object. The nearest neighbor class of a test object is the class that maximizes the probability of providing its nearest neighbor. The evidence is that the former concept is much more powerful than the latter in the presence of uncertainty, in that it correctly models the right semantics of the nearest neighbor decision rule when applied to the uncertain scenario. An effective and efficient algorithm to perform uncertain nearest neighbor classification of a generic (un)certain test object is designed, based on properties that greatly reduce the temporal cost associated with nearest neighbor class probability computation. 
Experimental results are presented, showing that the UNN rule is effective and efficient in classifying uncertain data.", acknowledgement = ack-nhfb, articleno = "1", fjournal = "ACM Transactions on Knowledge Discovery from Data (TKDD)", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J1054", } @Article{Wang:2013:CDS, author = "Dingding Wang and Shenghuo Zhu and Tao Li and Yihong Gong", title = "Comparative Document Summarization via Discriminative Sentence Selection", journal = j-TKDD, volume = "7", number = "1", pages = "2:1--2:??", month = mar, year = "2013", CODEN = "????", DOI = "http://dx.doi.org/10.1145/2435209.2435211", ISSN = "1556-4681 (print), 1556-472X (electronic)", ISSN-L = "1556-4681", bibdate = "Mon Jun 24 13:02:44 MDT 2013", bibsource = "http://www.acm.org/pubs/contents/journals/tkdd/; http://www.math.utah.edu/pub/tex/bib/tkdd.bib", abstract = "Given a collection of document groups, a natural question is to identify the differences among these groups. Although traditional document summarization techniques can summarize the content of the document groups one by one, there exists a great necessity to generate a summary of the differences among the document groups. In this article, we study a novel problem of summarizing the differences between document groups. A discriminative sentence selection method is proposed to extract the most discriminative sentences that represent the specific characteristics of each document group. Experiments and case studies on real-world data sets demonstrate the effectiveness of our proposed method.", acknowledgement = ack-nhfb, articleno = "2", fjournal = "ACM Transactions on Knowledge Discovery from Data (TKDD)", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J1054", } @Article{Bayati:2013:MPA, author = "Mohsen Bayati and David F. 
Gleich and Amin Saberi and Ying Wang", title = "Message-Passing Algorithms for Sparse Network Alignment", journal = j-TKDD, volume = "7", number = "1", pages = "3:1--3:??", month = mar, year = "2013", CODEN = "????", DOI = "http://dx.doi.org/10.1145/2435209.2435212", ISSN = "1556-4681 (print), 1556-472X (electronic)", ISSN-L = "1556-4681", bibdate = "Mon Jun 24 13:02:44 MDT 2013", bibsource = "http://www.acm.org/pubs/contents/journals/tkdd/; http://www.math.utah.edu/pub/tex/bib/tkdd.bib", abstract = "Network alignment generalizes and unifies several approaches for forming a matching or alignment between the vertices of two graphs. We study a mathematical programming framework for network alignment problem and a sparse variation of it where only a small number of matches between the vertices of the two graphs are possible. We propose a new message passing algorithm that allows us to compute, very efficiently, approximate solutions to the sparse network alignment problems with graph sizes as large as hundreds of thousands of vertices. We also provide extensive simulations comparing our algorithms with two of the best solvers for network alignment problems on two synthetic matching problems, two bioinformatics problems, and three large ontology alignment problems including a multilingual problem with a known labeled alignment.", acknowledgement = ack-nhfb, articleno = "3", fjournal = "ACM Transactions on Knowledge Discovery from Data (TKDD)", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J1054", } @Article{Li:2013:CWM, author = "Bin Li and Steven C. H. 
Hoi and Peilin Zhao and Vivekanand Gopalkrishnan", title = "Confidence Weighted Mean Reversion Strategy for Online Portfolio Selection", journal = j-TKDD, volume = "7", number = "1", pages = "4:1--4:??", month = mar, year = "2013", CODEN = "????", DOI = "http://dx.doi.org/10.1145/2435209.2435213", ISSN = "1556-4681 (print), 1556-472X (electronic)", ISSN-L = "1556-4681", bibdate = "Mon Jun 24 13:02:44 MDT 2013", bibsource = "http://www.acm.org/pubs/contents/journals/tkdd/; http://www.math.utah.edu/pub/tex/bib/tkdd.bib", abstract = "Online portfolio selection has been attracting increasing attention from the data mining and machine learning communities. All existing online portfolio selection strategies focus on the first order information of a portfolio vector, though the second order information may also be beneficial to a strategy. Moreover, empirical evidence shows that relative stock prices may follow the mean reversion property, which has not been fully exploited by existing strategies. This article proposes a novel online portfolio selection strategy named Confidence Weighted Mean Reversion (CWMR). Inspired by the mean reversion principle in finance and confidence weighted online learning technique in machine learning, CWMR models the portfolio vector as a Gaussian distribution, and sequentially updates the distribution by following the mean reversion trading principle. CWMR's closed-form updates clearly reflect the mean reversion trading idea. We also present several variants of CWMR algorithms, including a CWMR mixture algorithm that is theoretical universal. Empirically, CWMR strategy is able to effectively exploit the power of mean reversion for online portfolio selection. Extensive experiments on various real markets show that the proposed strategy is superior to the state-of-the-art techniques. 
The experimental testbed including source codes and data sets is available online.", acknowledgement = ack-nhfb, articleno = "4", fjournal = "ACM Transactions on Knowledge Discovery from Data (TKDD)", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J1054", } @Article{Lou:2013:LPR, author = "Tiancheng Lou and Jie Tang and John Hopcroft and Zhanpeng Fang and Xiaowen Ding", title = "Learning to predict reciprocity and triadic closure in social networks", journal = j-TKDD, volume = "7", number = "2", pages = "5:1--5:??", month = jul, year = "2013", CODEN = "????", DOI = "http://dx.doi.org/10.1145/2499907.2499908", ISSN = "1556-4681 (print), 1556-472X (electronic)", ISSN-L = "1556-4681", bibdate = "Thu Mar 13 09:16:06 MDT 2014", bibsource = "http://www.acm.org/pubs/contents/journals/tkdd/; http://www.math.utah.edu/pub/tex/bib/tkdd.bib", abstract = "We study how links are formed in social networks. In particular, we focus on investigating how a reciprocal (two-way) link, the basic relationship in social networks, is developed from a parasocial (one-way) relationship and how the relationships further develop into triadic closure, one of the fundamental processes of link formation. We first investigate how geographic distance and interactions between users influence the formation of link structure among users. Then we study how social theories including homophily, social balance, and social status are satisfied over networks with parasocial and reciprocal relationships. The study unveils several interesting phenomena. For example, ``friend's friend is a friend'' indeed exists in the reciprocal relationship network, but does not hold in the parasocial relationship network. We propose a learning framework to formulate the problems of predicting reciprocity and triadic closure into a graphical model. We demonstrate that it is possible to accurately infer 90\% of reciprocal relationships in a Twitter network. 
The proposed model also achieves better performance (+20--30\% in terms of F1-measure) than several alternative methods for predicting the triadic closure formation.", acknowledgement = ack-nhfb, articleno = "5", fjournal = "ACM Transactions on Knowledge Discovery from Data (TKDD)", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J1054", } @Article{Yang:2013:EOL, author = "Haiqin Yang and Michael R. Lyu and Irwin King", title = "Efficient online learning for multitask feature selection", journal = j-TKDD, volume = "7", number = "2", pages = "6:1--6:??", month = jul, year = "2013", CODEN = "????", DOI = "http://dx.doi.org/10.1145/2499907.2499909", ISSN = "1556-4681 (print), 1556-472X (electronic)", ISSN-L = "1556-4681", bibdate = "Thu Mar 13 09:16:06 MDT 2014", bibsource = "http://www.acm.org/pubs/contents/journals/tkdd/; http://www.math.utah.edu/pub/tex/bib/tkdd.bib", abstract = "Learning explanatory features across multiple related tasks, or MultiTask Feature Selection (MTFS), is an important problem in the applications of data mining, machine learning, and bioinformatics. Previous MTFS methods fulfill this task by batch-mode training. This makes them inefficient when data come sequentially or when the number of training data is so large that they cannot be loaded into the memory simultaneously. In order to tackle these problems, we propose a novel online learning framework to solve the MTFS problem. A main advantage of the online algorithm is its efficiency in both time complexity and memory cost. The weights of the MTFS models at each iteration can be updated by closed-form solutions based on the average of previous subgradients. This yields the worst-case bounds of the time complexity and memory cost at each iteration, both in the order of $O(d \times Q)$, where $d$ is the number of feature dimensions and $Q$ is the number of tasks. 
Moreover, we provide theoretical analysis for the average regret of the online learning algorithms, which also guarantees the convergence rate of the algorithms. Finally, we conduct detailed experiments to show the characteristics and merits of the online learning algorithms in solving several MTFS problems.", acknowledgement = ack-nhfb, articleno = "6", fjournal = "ACM Transactions on Knowledge Discovery from Data (TKDD)", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J1054", } @Article{Zhang:2013:MRL, author = "Yu Zhang and Dit-Yan Yeung", title = "Multilabel relationship learning", journal = j-TKDD, volume = "7", number = "2", pages = "7:1--7:??", month = jul, year = "2013", CODEN = "????", DOI = "http://dx.doi.org/10.1145/2499907.2499910", ISSN = "1556-4681 (print), 1556-472X (electronic)", ISSN-L = "1556-4681", bibdate = "Thu Mar 13 09:16:06 MDT 2014", bibsource = "http://www.acm.org/pubs/contents/journals/tkdd/; http://www.math.utah.edu/pub/tex/bib/tkdd.bib", abstract = "Multilabel learning problems are commonly found in many applications. A characteristic shared by many multilabel learning problems is that some labels have significant correlations between them. In this article, we propose a novel multilabel learning method, called MultiLabel Relationship Learning (MLRL), which extends the conventional support vector machine by explicitly learning and utilizing the relationships between labels. Specifically, we model the label relationships using a label covariance matrix and use it to define a new regularization term for the optimization problem. MLRL learns the model parameters and the label covariance matrix simultaneously based on a unified convex formulation. To solve the convex optimization problem, we use an alternating method in which each subproblem can be solved efficiently. The relationship between MLRL and two widely used maximum margin methods for multilabel learning is investigated. 
Moreover, we also propose a semisupervised extension of MLRL, called SSMLRL, to demonstrate how to make use of unlabeled data to help learn the label covariance matrix. Through experiments conducted on some multilabel applications, we find that MLRL not only gives higher classification accuracy but also has better interpretability as revealed by the label covariance matrix.", acknowledgement = ack-nhfb, articleno = "7", fjournal = "ACM Transactions on Knowledge Discovery from Data (TKDD)", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J1054", } @Article{Peng:2013:EFF, author = "Jing Peng and Guna Seetharaman and Wei Fan and Aparna Varde", title = "Exploiting {Fisher} and {Fukunaga--Koontz} transforms in {Chernoff} dimensionality reduction", journal = j-TKDD, volume = "7", number = "2", pages = "8:1--8:??", month = jul, year = "2013", CODEN = "????", DOI = "http://dx.doi.org/10.1145/2499907.2499911", ISSN = "1556-4681 (print), 1556-472X (electronic)", ISSN-L = "1556-4681", bibdate = "Thu Mar 13 09:16:06 MDT 2014", bibsource = "http://www.acm.org/pubs/contents/journals/tkdd/; http://www.math.utah.edu/pub/tex/bib/tkdd.bib", abstract = "Knowledge discovery from big data demands effective representation of data. However, big data are often characterized by high dimensionality, which makes knowledge discovery more difficult. Many techniques for dimensionality reduction have been proposed, including well-known Fisher's Linear Discriminant Analysis (LDA). However, the Fisher criterion is incapable of dealing with heteroscedasticity in the data. A technique based on the Chernoff criterion for linear dimensionality reduction has been proposed that is capable of exploiting heteroscedastic information in the data. While the Chernoff criterion has been shown to outperform the Fisher's, a clear understanding of its exact behavior is lacking. In this article, we show precisely what can be expected from the Chernoff criterion. 
In particular, we show that the Chernoff criterion exploits the Fisher and Fukunaga-Koontz transforms in computing its linear discriminants. Furthermore, we show that a recently proposed decomposition of the data space into four subspaces is incomplete. We provide arguments on how to best enrich the decomposition of the data space in order to account for heteroscedasticity in the data. Finally, we provide experimental results validating our theoretical analysis.", acknowledgement = ack-nhfb, articleno = "8", fjournal = "ACM Transactions on Knowledge Discovery from Data (TKDD)", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J1054", } @Article{Agarwal:2013:ISI, author = "Deepak Agarwal and Rich Caruana and Jian Pei and Ke Wang", title = "Introduction to the {Special Issue ACM SIGKDD 2012}", journal = j-TKDD, volume = "7", number = "3", pages = "9:1--9:??", month = sep, year = "2013", CODEN = "????", DOI = "http://dx.doi.org/10.1145/2513092.2513093", ISSN = "1556-4681 (print), 1556-472X (electronic)", ISSN-L = "1556-4681", bibdate = "Thu Mar 13 09:16:07 MDT 2014", bibsource = "http://www.acm.org/pubs/contents/journals/tkdd/; http://www.math.utah.edu/pub/tex/bib/tkdd.bib", acknowledgement = ack-nhfb, articleno = "9", fjournal = "ACM Transactions on Knowledge Discovery from Data (TKDD)", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J1054", } @Article{Rakthanmanon:2013:ABD, author = "Thanawin Rakthanmanon and Bilson Campana and Abdullah Mueen and Gustavo Batista and Brandon Westover and Qiang Zhu and Jesin Zakaria and Eamonn Keogh", title = "Addressing Big Data Time Series: Mining Trillions of Time Series Subsequences Under Dynamic Time Warping", journal = j-TKDD, volume = "7", number = "3", pages = "10:1--10:??", month = sep, year = "2013", CODEN = "????", DOI = "http://dx.doi.org/10.1145/2500489", ISSN = "1556-4681 (print), 1556-472X (electronic)", ISSN-L = "1556-4681", bibdate = "Thu Mar 13 09:16:07 MDT 2014", bibsource = 
"http://www.acm.org/pubs/contents/journals/tkdd/; http://www.math.utah.edu/pub/tex/bib/tkdd.bib", abstract = "Most time series data mining algorithms use similarity search as a core subroutine, and thus the time taken for similarity search is the bottleneck for virtually all time series data mining algorithms, including classification, clustering, motif discovery, anomaly detection, and so on. The difficulty of scaling a search to large datasets explains to a great extent why most academic work on time series data mining has plateaued at considering a few millions of time series objects, while much of industry and science sits on billions of time series objects waiting to be explored. In this work we show that by using a combination of four novel ideas we can search and mine massive time series for the first time. We demonstrate the following unintuitive fact: in large datasets we can exactly search under Dynamic Time Warping (DTW) much more quickly than the current state-of-the-art Euclidean distance search algorithms. We demonstrate our work on the largest set of time series experiments ever attempted. In particular, the largest dataset we consider is larger than the combined size of all of the time series datasets considered in all data mining papers ever published. We explain how our ideas allow us to solve higher-level time series data mining problems such as motif discovery and clustering at scales that would otherwise be untenable. Moreover, we show how our ideas allow us to efficiently support the uniform scaling distance measure, a measure whose utility seems to be underappreciated, but which we demonstrate here. 
In addition to mining massive datasets with up to one trillion datapoints, we will show that our ideas also have implications for real-time monitoring of data streams, allowing us to handle much faster arrival rates and/or use cheaper and lower powered devices than are currently possible.", acknowledgement = ack-nhfb, articleno = "10", fjournal = "ACM Transactions on Knowledge Discovery from Data (TKDD)", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J1054", } @Article{Sun:2013:PIM, author = "Yizhou Sun and Brandon Norick and Jiawei Han and Xifeng Yan and Philip S. Yu and Xiao Yu", title = "{PathSelClus}: Integrating Meta-Path Selection with User-Guided Object Clustering in Heterogeneous Information Networks", journal = j-TKDD, volume = "7", number = "3", pages = "11:1--11:??", month = sep, year = "2013", CODEN = "????", DOI = "http://dx.doi.org/10.1145/2500492", ISSN = "1556-4681 (print), 1556-472X (electronic)", ISSN-L = "1556-4681", bibdate = "Thu Mar 13 09:16:07 MDT 2014", bibsource = "http://www.acm.org/pubs/contents/journals/tkdd/; http://www.math.utah.edu/pub/tex/bib/tkdd.bib", abstract = "Real-world, multiple-typed objects are often interconnected, forming heterogeneous information networks. A major challenge for link-based clustering in such networks is their potential to generate many different results, carrying rather diverse semantic meanings. In order to generate desired clustering, we propose to use meta-path, a path that connects object types via a sequence of relations, to control clustering with distinct semantics. Nevertheless, it is easier for a user to provide a few examples (seeds) than a weighted combination of sophisticated meta-paths to specify her clustering preference. Thus, we propose to integrate meta-path selection with user-guided clustering to cluster objects in networks, where a user first provides a small set of object seeds for each cluster as guidance. 
Then the system learns the weight for each meta-path that is consistent with the clustering result implied by the guidance, and generates clusters under the learned weights of meta-paths. A probabilistic approach is proposed to solve the problem, and an effective and efficient iterative algorithm, PathSelClus, is proposed to learn the model, where the clustering quality and the meta-path weights mutually enhance each other. Our experiments with several clustering tasks in two real networks and one synthetic network demonstrate the power of the algorithm in comparison with the baselines.", acknowledgement = ack-nhfb, articleno = "11", fjournal = "ACM Transactions on Knowledge Discovery from Data (TKDD)", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J1054", } @Article{Bellare:2013:ASE, author = "Kedar Bellare and Suresh Iyengar and Aditya Parameswaran and Vibhor Rastogi", title = "Active Sampling for Entity Matching with Guarantees", journal = j-TKDD, volume = "7", number = "3", pages = "12:1--12:??", month = sep, year = "2013", CODEN = "????", DOI = "http://dx.doi.org/10.1145/2500490", ISSN = "1556-4681 (print), 1556-472X (electronic)", ISSN-L = "1556-4681", bibdate = "Thu Mar 13 09:16:07 MDT 2014", bibsource = "http://www.acm.org/pubs/contents/journals/tkdd/; http://www.math.utah.edu/pub/tex/bib/tkdd.bib", abstract = "In entity matching, a fundamental issue while training a classifier to label pairs of entities as either duplicates or nonduplicates is the one of selecting informative training examples. Although active learning presents an attractive solution to this problem, previous approaches minimize the misclassification rate (0--1 loss) of the classifier, which is an unsuitable metric for entity matching due to class imbalance (i.e., many more nonduplicate pairs than duplicate pairs). To address this, a recent paper [Arasu et al. 
2010] proposes to maximize recall of the classifier under the constraint that its precision should be greater than a specified threshold. However, the proposed technique requires the labels of all n input pairs in the worst case. Our main result is an active learning algorithm that approximately maximizes recall of the classifier while respecting a precision constraint with provably sublinear label complexity (under certain distributional assumptions). Our algorithm uses as a black box any active learning module that minimizes 0--1 loss. We show that label complexity of our algorithm is at most log n times the label complexity of the black box, and also bound the difference in the recall of classifier learnt by our algorithm and the recall of the optimal classifier satisfying the precision constraint. We provide an empirical evaluation of our algorithm on several real-world matching data sets that demonstrates the effectiveness of our approach.", acknowledgement = ack-nhfb, articleno = "12", fjournal = "ACM Transactions on Knowledge Discovery from Data (TKDD)", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J1054", } @Article{Chattopadhyay:2013:BMA, author = "Rita Chattopadhyay and Zheng Wang and Wei Fan and Ian Davidson and Sethuraman Panchanathan and Jieping Ye", title = "Batch Mode Active Sampling Based on Marginal Probability Distribution Matching", journal = j-TKDD, volume = "7", number = "3", pages = "13:1--13:??", month = sep, year = "2013", CODEN = "????", DOI = "http://dx.doi.org/10.1145/2513092.2513094", ISSN = "1556-4681 (print), 1556-472X (electronic)", ISSN-L = "1556-4681", bibdate = "Thu Mar 13 09:16:07 MDT 2014", bibsource = "http://www.acm.org/pubs/contents/journals/tkdd/; http://www.math.utah.edu/pub/tex/bib/tkdd.bib", abstract = "Active Learning is a machine learning and data mining technique that selects the most informative samples for labeling and uses them as training data; it is especially useful when there are large amount of 
unlabeled data and labeling them is expensive. Recently, batch-mode active learning, where a set of samples are selected concurrently for labeling, based on their collective merit, has attracted a lot of attention. The objective of batch-mode active learning is to select a set of informative samples so that a classifier learned on these samples has good generalization performance on the unlabeled data. Most of the existing batch-mode active learning methodologies try to achieve this by selecting samples based on certain criteria. In this article we propose a novel criterion which achieves good generalization performance of a classifier by specifically selecting a set of query samples that minimize the difference in distribution between the labeled and the unlabeled data, after annotation. We explicitly measure this difference based on all candidate subsets of the unlabeled data and select the best subset. The proposed objective is an NP-hard integer programming optimization problem. We provide two optimization techniques to solve this problem. In the first one, the problem is transformed into a convex quadratic programming problem and in the second method the problem is transformed into a linear programming problem. Our empirical studies using publicly available UCI datasets and two biomedical image databases demonstrate the effectiveness of the proposed approach in comparison with the state-of-the-art batch-mode active learning methods. We also present two extensions of the proposed approach, which incorporate uncertainty of the predicted labels of the unlabeled data and transfer learning in the proposed formulation. In addition, we present a joint optimization framework for performing both transfer and active learning simultaneously unlike the existing approaches of learning in two separate stages, that is, typically, transfer learning followed by active learning. 
We specifically minimize a common objective of reducing distribution difference between the domain adapted source, the queried and labeled samples and the rest of the unlabeled target domain data. Our empirical studies on two biomedical image databases and on a publicly available 20 Newsgroups dataset show that incorporation of uncertainty information and transfer learning further improves the performance of the proposed active learning based classifier. Our empirical studies also show that the proposed transfer-active method based on the joint optimization framework performs significantly better than a framework which implements transfer and active learning in two separate stages.", acknowledgement = ack-nhfb, articleno = "13", fjournal = "ACM Transactions on Knowledge Discovery from Data (TKDD)", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J1054", } @Article{Briggs:2013:IAM, author = "Forrest Briggs and Xiaoli Z. Fern and Raviv Raich and Qi Lou", title = "Instance Annotation for Multi-Instance Multi-Label Learning", journal = j-TKDD, volume = "7", number = "3", pages = "14:1--14:??", month = sep, year = "2013", CODEN = "????", DOI = "http://dx.doi.org/10.1145/2500491", ISSN = "1556-4681 (print), 1556-472X (electronic)", ISSN-L = "1556-4681", bibdate = "Thu Mar 13 09:16:07 MDT 2014", bibsource = "http://www.acm.org/pubs/contents/journals/tkdd/; http://www.math.utah.edu/pub/tex/bib/tkdd.bib", abstract = "Multi-instance multi-label learning (MIML) is a framework for supervised classification where the objects to be classified are bags of instances associated with multiple labels. For example, an image can be represented as a bag of segments and associated with a list of objects it contains. Prior work on MIML has focused on predicting label sets for previously unseen bags. We instead consider the problem of predicting instance labels while learning from data labeled only at the bag level. 
We propose a regularized rank-loss objective designed for instance annotation, which can be instantiated with different aggregation models connecting instance-level labels with bag-level label sets. The aggregation models that we consider can be factored as a linear function of a ``support instance'' for each class, which is a single feature vector representing a whole bag. Hence we name our proposed methods rank-loss Support Instance Machines (SIM). We propose two optimization methods for the rank-loss objective, which is nonconvex. One is a heuristic method that alternates between updating support instances, and solving a convex problem in which the support instances are treated as constant. The other is to apply the constrained concave-convex procedure (CCCP), which can also be interpreted as iteratively updating support instances and solving a convex problem. To solve the convex problem, we employ the Pegasos framework of primal subgradient descent, and prove that it finds an $ \epsilon $-suboptimal solution in runtime that is linear in the number of bags, instances, and $ 1 / \epsilon $. Additionally, we suggest a method of extending the linear learning algorithm to nonlinear classification, without increasing the runtime asymptotically. 
Experiments on artificial and real-world datasets including images and audio show that the proposed methods achieve higher accuracy than other loss functions used in prior work, e.g., Hamming loss, and recent work in ambiguous label classification.", acknowledgement = ack-nhfb, articleno = "14", fjournal = "ACM Transactions on Knowledge Discovery from Data (TKDD)", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J1054", } @Article{Ji:2013:PFR, author = "Ming Ji and Binbin Lin and Xiaofei He and Deng Cai and Jiawei Han", title = "Parallel Field Ranking", journal = j-TKDD, volume = "7", number = "3", pages = "15:1--15:??", month = sep, year = "2013", CODEN = "????", DOI = "http://dx.doi.org/10.1145/2513092.2513096", ISSN = "1556-4681 (print), 1556-472X (electronic)", ISSN-L = "1556-4681", bibdate = "Thu Mar 13 09:16:07 MDT 2014", bibsource = "http://www.acm.org/pubs/contents/journals/tkdd/; http://www.math.utah.edu/pub/tex/bib/tkdd.bib", abstract = "Recently, ranking data with respect to the intrinsic geometric structure (manifold ranking) has received considerable attentions, with encouraging performance in many applications in pattern recognition, information retrieval and recommendation systems. Most of the existing manifold ranking methods focus on learning a ranking function that varies smoothly along the data manifold. However, beyond smoothness, a desirable ranking function should vary monotonically along the geodesics of the data manifold, such that the ranking order along the geodesics is preserved. In this article, we aim to learn a ranking function that varies linearly and therefore monotonically along the geodesics of the data manifold. Recent theoretical work shows that the gradient field of a linear function on the manifold has to be a parallel vector field. Therefore, we propose a novel ranking algorithm on the data manifolds, called Parallel Field Ranking. Specifically, we try to learn a ranking function and a vector field simultaneously. 
We require the vector field to be close to the gradient field of the ranking function, and the vector field to be as parallel as possible. Moreover, we require the value of the ranking function at the query point to be the highest, and then decrease linearly along the manifold. Experimental results on both synthetic data and real data demonstrate the effectiveness of our proposed algorithm.", acknowledgement = ack-nhfb, articleno = "15", fjournal = "ACM Transactions on Knowledge Discovery from Data (TKDD)", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J1054", } @Article{Adali:2013:IPR, author = "Sibel Adali and Malik Magdon-Ismail and Xiaohui Lu", title = "{iHypR}: Prominence ranking in networks of collaborations with hyperedges", journal = j-TKDD, volume = "7", number = "4", pages = "16:1--16:??", month = nov, year = "2013", CODEN = "????", DOI = "http://dx.doi.org/10.1145/2541268.2541269", ISSN = "1556-4681 (print), 1556-472X (electronic)", ISSN-L = "1556-4681", bibdate = "Thu Mar 13 09:16:09 MDT 2014", bibsource = "http://www.acm.org/pubs/contents/journals/tkdd/; http://www.math.utah.edu/pub/tex/bib/tkdd.bib", abstract = "We present a new algorithm called iHypR for computing prominence of actors in social networks of collaborations. Our algorithm builds on the assumption that prominent actors collaborate on prominent objects, and prominent objects are naturally grouped into prominent clusters or groups (hyperedges in a graph). iHypR makes use of the relationships between actors, objects, and hyperedges to compute a global prominence score for the actors in the network. We do not assume the hyperedges are given in advance. Hyperedges computed by our method can perform as well or even better than ``true'' hyperedges. Our algorithm is customized for networks of collaborations, but it is generally applicable without further tuning. 
We show, through extensive experimentation with three real-life data sets and multiple external measures of prominence, that our algorithm outperforms existing well-known algorithms. Our work is the first to offer such an extensive evaluation. We show that unlike most existing algorithms, the performance is robust across multiple measures of performance. Further, we give a detailed study of the sensitivity of our algorithm to different data sets and the design choices within the algorithm that a user may wish to change. Our article illustrates the various trade-offs that must be considered in computing prominence in collaborative social networks.", acknowledgement = ack-nhfb, articleno = "16", fjournal = "ACM Transactions on Knowledge Discovery from Data (TKDD)", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J1054", } @Article{Huang:2013:STP, author = "Jin Huang and Feiping Nie and Heng Huang and Yi-Cheng Tu and Yu Lei", title = "Social trust prediction using heterogeneous networks", journal = j-TKDD, volume = "7", number = "4", pages = "17:1--17:??", month = nov, year = "2013", CODEN = "????", DOI = "http://dx.doi.org/10.1145/2541268.2541270", ISSN = "1556-4681 (print), 1556-472X (electronic)", ISSN-L = "1556-4681", bibdate = "Thu Mar 13 09:16:09 MDT 2014", bibsource = "http://www.acm.org/pubs/contents/journals/tkdd/; http://www.math.utah.edu/pub/tex/bib/tkdd.bib", abstract = "Along with increasing popularity of social websites, online users rely more on the trustworthiness information to make decisions, extract and filter information, and tag and build connections with other users. However, such social network data often suffer from severe data sparsity and are not able to provide users with enough information. Therefore, trust prediction has emerged as an important topic in social network research. Traditional approaches are primarily based on exploring trust graph topology itself. 
However, research in sociology and our life experience suggest that people who are in the same social circle often exhibit similar behaviors and tastes. To take advantage of the ancillary information for trust prediction, the challenge then becomes what to transfer and how to transfer. In this article, we address this problem by aggregating heterogeneous social networks and propose a novel joint social networks mining (JSNM) method. Our new joint learning model explores the user-group-level similarity between correlated graphs and simultaneously learns the individual graph structure; therefore, the shared structures and patterns from multiple social networks can be utilized to enhance the prediction tasks. As a result, we not only improve the trust prediction in the target graph but also facilitate other information retrieval tasks in the auxiliary graphs. To optimize the proposed objective function, we use the alternative technique to break down the objective function into several manageable subproblems. We further introduce the auxiliary function to solve the optimization problems with rigorously proved convergence. The extensive experiments have been conducted on both synthetic and real-world data. 
All empirical results demonstrate the effectiveness of our method.", acknowledgement = ack-nhfb, articleno = "17", fjournal = "ACM Transactions on Knowledge Discovery from Data (TKDD)", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J1054", } @Article{Guzzo:2013:SIF, author = "Antonella Guzzo and Luigi Moccia and Domenico Sacc{\`a} and Edoardo Serra", title = "Solving inverse frequent itemset mining with infrequency constraints via large-scale linear programs", journal = j-TKDD, volume = "7", number = "4", pages = "18:1--18:??", month = nov, year = "2013", CODEN = "????", DOI = "http://dx.doi.org/10.1145/2541268.2541271", ISSN = "1556-4681 (print), 1556-472X (electronic)", ISSN-L = "1556-4681", bibdate = "Thu Mar 13 09:16:09 MDT 2014", bibsource = "http://www.acm.org/pubs/contents/journals/tkdd/; http://www.math.utah.edu/pub/tex/bib/tkdd.bib", abstract = "Inverse frequent set mining (IFM) is the problem of computing a transaction database D satisfying given support constraints for some itemsets, which are typically the frequent ones. This article proposes a new formulation of IFM, called IFM$_I$ (IFM with infrequency constraints), where the itemsets that are not listed as frequent are constrained to be infrequent; that is, they must have a support less than or equal to a specified unique threshold. An instance of IFM$_I$ can be seen as an instance of the original IFM by making explicit the infrequency constraints for the minimal infrequent itemsets, corresponding to the so-called negative generator border defined in the literature. The complexity increase from PSPACE (complexity of IFM) to NEXP (complexity of IFM$_I$) is caused by the cardinality of the negative generator border, which can be exponential in the original input size. 
Therefore, the article introduces a specific problem parameter $ \kappa $ that computes an upper bound to this cardinality using a hypergraph interpretation for which minimal infrequent itemsets correspond to minimal transversals. By fixing a constant $k$, the article formulates a $k$-bounded definition of the problem, called $k$-IFM$_I$, that collects all instances for which the value of the parameter $ \kappa $ is less than or equal to $k$ --- its complexity is in PSPACE as for IFM. The bounded problem is encoded as an integer linear program with a large number of variables (actually exponential w.r.t. the number of constraints), which is thereafter approximated by relaxing integer constraints --- the decision problem of solving the linear program is proven to be in NP. In order to solve the linear program, a column generation technique is used that is a variation of the simplex method designed to solve large-scale linear programs, in particular with a huge number of variables. The method at each step requires the solution of an auxiliary integer linear program, which is proven to be NP-hard in this case and for which a greedy heuristic is presented. The resulting overall column generation solution algorithm enjoys very good scaling as evidenced by the intensive experimentation, thereby paving the way for its application in real-life scenarios.", acknowledgement = ack-nhfb, articleno = "18", fjournal = "ACM Transactions on Knowledge Discovery from Data (TKDD)", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J1054", } @Article{Balcazar:2013:FCP, author = "Jos{\'e} L. 
Balc{\'a}zar", title = "Formal and computational properties of the confidence boost of association rules", journal = j-TKDD, volume = "7", number = "4", pages = "19:1--19:??", month = nov, year = "2013", CODEN = "????", DOI = "http://dx.doi.org/10.1145/2541268.2541272", ISSN = "1556-4681 (print), 1556-472X (electronic)", ISSN-L = "1556-4681", bibdate = "Thu Mar 13 09:16:09 MDT 2014", bibsource = "http://www.acm.org/pubs/contents/journals/tkdd/; http://www.math.utah.edu/pub/tex/bib/tkdd.bib", abstract = "Some existing notions of redundancy among association rules allow for a logical-style characterization and lead to irredundant bases of absolutely minimum size. We push the intuition of redundancy further to find an intuitive notion of novelty of an association rule, with respect to other rules. Namely, an irredundant rule is so because its confidence is higher than what the rest of the rules would suggest; then, one can ask: how much higher? We propose to measure such a sort of novelty through the confidence boost of a rule. Acting as a complement to confidence and support, the confidence boost helps to obtain small and crisp sets of mined association rules and solves the well-known problem that, in certain cases, rules of negative correlation may pass the confidence bound. We analyze the properties of two versions of the notion of confidence boost, one of them a natural generalization of the other. We develop algorithms to filter rules according to their confidence boost, compare the concept to some similar notions in the literature, and describe the results of some experimentation employing the new notions on standard benchmark datasets. 
We describe an open source association mining tool that embodies one of our variants of confidence boost in such a way that the data mining process does not require the user to select any value for any parameter.", acknowledgement = ack-nhfb, articleno = "19", fjournal = "ACM Transactions on Knowledge Discovery from Data (TKDD)", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J1054", } @Article{Ang:2013:CPN, author = "Hock Hee Ang and Vivekanand Gopalkrishnan and Steven C. H. Hoi and Wee Keong Ng", title = "Classification in {P2P} networks with cascade support vector machines", journal = j-TKDD, volume = "7", number = "4", pages = "20:1--20:??", month = nov, year = "2013", CODEN = "????", DOI = "http://dx.doi.org/10.1145/2541268.2541273", ISSN = "1556-4681 (print), 1556-472X (electronic)", ISSN-L = "1556-4681", bibdate = "Thu Mar 13 09:16:09 MDT 2014", bibsource = "http://www.acm.org/pubs/contents/journals/tkdd/; http://www.math.utah.edu/pub/tex/bib/tkdd.bib", abstract = "Classification in Peer-to-Peer (P2P) networks is important to many real applications, such as distributed intrusion detection, distributed recommendation systems, and distributed antispam detection. However, it is very challenging to perform classification in P2P networks due to many practical issues, such as scalability, peer dynamism, and asynchronism. This article investigates the practical techniques of constructing Support Vector Machine (SVM) classifiers in the P2P networks. In particular, we demonstrate how to efficiently cascade SVM in a P2P network with the use of reduced SVM. In addition, we propose to fuse the concept of cascade SVM with bootstrap aggregation to effectively balance the trade-off between classification accuracy, model construction, and prediction cost. We provide theoretical insights for the proposed solutions and conduct an extensive set of empirical studies on a number of large-scale datasets. 
Encouraging results validate the efficacy of the proposed approach.", acknowledgement = ack-nhfb, articleno = "20", fjournal = "ACM Transactions on Knowledge Discovery from Data (TKDD)", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J1054", } @Article{Chen:2014:ISI, author = "Wei Chen and Jie Tang", title = "Introduction to special issue on computational aspects of social and information networks: Theory, methodologies, and applications {(TKDD-CASIN)}", journal = j-TKDD, volume = "8", number = "1", pages = "1:1--1:??", month = feb, year = "2014", CODEN = "????", DOI = "http://dx.doi.org/10.1145/2556608", ISSN = "1556-4681 (print), 1556-472X (electronic)", ISSN-L = "1556-4681", bibdate = "Thu Mar 13 09:16:11 MDT 2014", bibsource = "http://www.acm.org/pubs/contents/journals/tkdd/; http://www.math.utah.edu/pub/tex/bib/tkdd.bib", acknowledgement = ack-nhfb, articleno = "1", fjournal = "ACM Transactions on Knowledge Discovery from Data (TKDD)", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J1054", } @Article{Yang:2014:USN, author = "Zhi Yang and Christo Wilson and Xiao Wang and Tingting Gao and Ben Y. Zhao and Yafei Dai", title = "Uncovering social network {Sybils} in the wild", journal = j-TKDD, volume = "8", number = "1", pages = "2:1--2:??", month = feb, year = "2014", CODEN = "????", DOI = "http://dx.doi.org/10.1145/2556609", ISSN = "1556-4681 (print), 1556-472X (electronic)", ISSN-L = "1556-4681", bibdate = "Thu Mar 13 09:16:11 MDT 2014", bibsource = "http://www.acm.org/pubs/contents/journals/tkdd/; http://www.math.utah.edu/pub/tex/bib/tkdd.bib", abstract = "Sybil accounts are fake identities created to unfairly increase the power or resources of a single malicious user. Researchers have long known about the existence of Sybil accounts in online communities such as file-sharing systems, but they have not been able to perform large-scale measurements to detect them or measure their activities. 
In this article, we describe our efforts to detect, characterize, and understand Sybil account activity in the Renren Online Social Network (OSN). We use ground truth provided by Renren Inc. to build measurement-based Sybil detectors and deploy them on Renren to detect more than 100,000 Sybil accounts. Using our full dataset of 650,000 Sybils, we examine several aspects of Sybil behavior. First, we study their link creation behavior and find that contrary to prior conjecture, Sybils in OSNs do not form tight-knit communities. Next, we examine the fine-grained behaviors of Sybils on Renren using clickstream data. Third, we investigate behind-the-scenes collusion between large groups of Sybils. Our results reveal that Sybils with no explicit social ties still act in concert to launch attacks. Finally, we investigate enhanced techniques to identify stealthy Sybils. In summary, our study advances the understanding of Sybil behavior on OSNs and shows that Sybils can effectively avoid existing community-based Sybil detectors. We hope that our results will foster new research on Sybil detection that is based on novel types of Sybil features.", acknowledgement = ack-nhfb, articleno = "2", fjournal = "ACM Transactions on Knowledge Discovery from Data (TKDD)", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J1054", } @Article{Jin:2014:SAR, author = "Ruoming Jin and Victor E. 
Lee and Longjie Li", title = "Scalable and axiomatic ranking of network role similarity", journal = j-TKDD, volume = "8", number = "1", pages = "3:1--3:??", month = feb, year = "2014", CODEN = "????", DOI = "http://dx.doi.org/10.1145/2518176", ISSN = "1556-4681 (print), 1556-472X (electronic)", ISSN-L = "1556-4681", bibdate = "Thu Mar 13 09:16:11 MDT 2014", bibsource = "http://www.acm.org/pubs/contents/journals/tkdd/; http://www.math.utah.edu/pub/tex/bib/tkdd.bib", abstract = "A key task in analyzing social networks and other complex networks is role analysis: describing and categorizing nodes according to how they interact with other nodes. Two nodes have the same role if they interact with equivalent sets of neighbors. The most fundamental role equivalence is automorphic equivalence. Unfortunately, the fastest algorithms known for graph automorphism are nonpolynomial. Moreover, since exact equivalence is rare, a more meaningful task is measuring the role similarity between any two nodes. This task is closely related to the structural or link-based similarity problem that SimRank addresses. However, SimRank and other existing similarity measures are not sufficient because they do not guarantee to recognize automorphically or structurally equivalent nodes. This article makes two contributions. First, we present and justify several axiomatic properties necessary for a role similarity measure or metric. Second, we present RoleSim, a new similarity metric that satisfies these axioms and can be computed with a simple iterative algorithm. We rigorously prove that RoleSim satisfies all of these axiomatic properties. We also introduce Iceberg RoleSim, a scalable algorithm that discovers all pairs with RoleSim scores above a user-defined threshold $ \theta $. 
We demonstrate the interpretative power of RoleSim on both synthetic and real datasets.", acknowledgement = ack-nhfb, articleno = "3", fjournal = "ACM Transactions on Knowledge Discovery from Data (TKDD)", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J1054", } @Article{Mcauley:2014:DSC, author = "Julian Mcauley and Jure Leskovec", title = "Discovering social circles in ego networks", journal = j-TKDD, volume = "8", number = "1", pages = "4:1--4:??", month = feb, year = "2014", CODEN = "????", DOI = "http://dx.doi.org/10.1145/2556612", ISSN = "1556-4681 (print), 1556-472X (electronic)", ISSN-L = "1556-4681", bibdate = "Thu Mar 13 09:16:11 MDT 2014", bibsource = "http://www.acm.org/pubs/contents/journals/tkdd/; http://www.math.utah.edu/pub/tex/bib/tkdd.bib", abstract = "People's personal social networks are big and cluttered, and currently there is no good way to automatically organize them. Social networking sites allow users to manually categorize their friends into social circles (e.g., ``circles'' on Google+, and ``lists'' on Facebook and Twitter). However, circles are laborious to construct and must be manually updated whenever a user's network grows. In this article, we study the novel task of automatically identifying users' social circles. We pose this task as a multimembership node clustering problem on a user's ego network, a network of connections between her friends. We develop a model for detecting circles that combines network structure as well as user profile information. For each circle, we learn its members and the circle-specific user profile similarity metric. Modeling node membership to multiple circles allows us to detect overlapping as well as hierarchically nested circles. 
Experiments show that our model accurately identifies circles on a diverse set of data from Facebook, Google+, and Twitter, for all of which we obtain hand-labeled ground truth.", acknowledgement = ack-nhfb, articleno = "4", fjournal = "ACM Transactions on Knowledge Discovery from Data (TKDD)", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J1054", } @Article{Abrahao:2014:SFA, author = "Bruno Abrahao and Sucheta Soundarajan and John Hopcroft and Robert Kleinberg", title = "A separability framework for analyzing community structure", journal = j-TKDD, volume = "8", number = "1", pages = "5:1--5:??", month = feb, year = "2014", CODEN = "????", DOI = "http://dx.doi.org/10.1145/2527231", ISSN = "1556-4681 (print), 1556-472X (electronic)", ISSN-L = "1556-4681", bibdate = "Thu Mar 13 09:16:11 MDT 2014", bibsource = "http://www.acm.org/pubs/contents/journals/tkdd/; http://www.math.utah.edu/pub/tex/bib/tkdd.bib", abstract = "Four major factors govern the intricacies of community extraction in networks: (1) the literature offers a multitude of disparate community detection algorithms whose output exhibits high structural variability across the collection, (2) communities identified by algorithms may differ structurally from real communities that arise in practice, (3) there is no consensus characterizing how to discriminate communities from noncommunities, and (4) the application domain includes a wide variety of networks of fundamentally different natures. In this article, we present a class separability framework to tackle these challenges through a comprehensive analysis of community properties. Our approach enables the assessment of the structural dissimilarity among the output of multiple community detection algorithms and between the output of algorithms and communities that arise in practice. In addition, our method provides us with a way to organize the vast collection of community detection algorithms by grouping those that behave similarly. 
Finally, we identify the most discriminative graph-theoretical properties of community signature and the small subset of properties that account for most of the biases of the different community detection algorithms. We illustrate our approach with an experimental analysis, which reveals nuances of the structure of real and extracted communities. In our experiments, we furnish our framework with the output of 10 different community detection procedures, representative of categories of popular algorithms available in the literature, applied to a diverse collection of large-scale real network datasets whose domains span biology, online shopping, and social systems. We also analyze communities identified by annotations that accompany the data, which reflect exemplar communities in various domain. We characterize these communities using a broad spectrum of community properties to produce the different structural classes. As our experiments show that community structure is not a universal concept, our framework enables an informed choice of the most suitable community detection method for identifying communities of a specific type in a given network and allows for a comparison of existing community detection algorithms while guiding the design of new ones.", acknowledgement = ack-nhfb, articleno = "5", fjournal = "ACM Transactions on Knowledge Discovery from Data (TKDD)", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J1054", } @Article{Zhong:2014:UBL, author = "Erheng Zhong and Wei Fan and Qiang Yang", title = "User behavior learning and transfer in composite social networks", journal = j-TKDD, volume = "8", number = "1", pages = "6:1--6:??", month = feb, year = "2014", CODEN = "????", DOI = "http://dx.doi.org/10.1145/2556613", ISSN = "1556-4681 (print), 1556-472X (electronic)", ISSN-L = "1556-4681", bibdate = "Thu Mar 13 09:16:11 MDT 2014", bibsource = "http://www.acm.org/pubs/contents/journals/tkdd/; http://www.math.utah.edu/pub/tex/bib/tkdd.bib", abstract = 
"Accurate prediction of user behaviors is important for many social media applications, including social marketing, personalization, and recommendation. A major challenge lies in that although many previous works model user behavior from only historical behavior logs, the available user behavior data or interactions between users and items in a given social network are usually very limited and sparse (e.g., $ \geq 99.9 \% $ empty), which makes models overfit the rare observations and fail to provide accurate predictions. We observe that many people are members of several social networks in the same time, such as Facebook, Twitter, and Tencent's QQ. Importantly, users' behaviors and interests in different networks influence one another. This provides an opportunity to leverage the knowledge of user behaviors in different networks by considering the overlapping users in different networks as bridges, in order to alleviate the data sparsity problem, and enhance the predictive performance of user behavior modeling. Combining different networks ``simply and naively'' does not work well. In this article, we formulate the problem to model multiple networks as ``adaptive composite transfer'' and propose a framework called ComSoc. ComSoc first selects the most suitable networks inside a composite social network via a hierarchical Bayesian model, parameterized for individual users. It then builds topic models for user behavior prediction using both the relationships in the selected networks and related behavior data. With different relational regularization, we introduce different implementations, corresponding to different ways to transfer knowledge from composite social relations. To handle big data, we have implemented the algorithm using Map/Reduce. 
We demonstrate that the proposed composite network-based user behavior models significantly improve the predictive accuracy over a number of existing approaches on several real-world applications, including a very large social networking dataset from Tencent Inc.", acknowledgement = ack-nhfb, articleno = "6", fjournal = "ACM Transactions on Knowledge Discovery from Data (TKDD)", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J1054", } @Article{Ahmed:2014:NSS, author = "Nesreen K. Ahmed and Jennifer Neville and Ramana Kompella", title = "Network Sampling: From Static to Streaming Graphs", journal = j-TKDD, volume = "8", number = "2", pages = "7:1--7:??", month = jun, year = "2014", CODEN = "????", DOI = "http://dx.doi.org/10.1145/2601438", ISSN = "1556-4681 (print), 1556-472X (electronic)", ISSN-L = "1556-4681", bibdate = "Thu Jun 26 05:48:22 MDT 2014", bibsource = "http://www.acm.org/pubs/contents/journals/tkdd/; http://www.math.utah.edu/pub/tex/bib/tkdd.bib", abstract = "Network sampling is integral to the analysis of social, information, and biological networks. Since many real-world networks are massive in size, continuously evolving, and/or distributed in nature, the network structure is often sampled in order to facilitate study. For these reasons, a more thorough and complete understanding of network sampling is critical to support the field of network science. In this paper, we outline a framework for the general problem of network sampling by highlighting the different objectives, population and units of interest, and classes of network sampling methods. In addition, we propose a spectrum of computational models for network sampling methods, ranging from the traditionally studied model based on the assumption of a static domain to a more challenging model that is appropriate for streaming domains. 
We design a family of sampling methods based on the concept of graph induction that generalize across the full spectrum of computational models (from static to streaming) while efficiently preserving many of the topological properties of the input graphs. Furthermore, we demonstrate how traditional static sampling algorithms can be modified for graph streams for each of the three main classes of sampling methods: node, edge, and topology-based sampling. Experimental results indicate that our proposed family of sampling methods more accurately preserve the underlying properties of the graph in both static and streaming domains. Finally, we study the impact of network sampling algorithms on the parameter estimation and performance evaluation of relational classification algorithms.", acknowledgement = ack-nhfb, articleno = "7", fjournal = "ACM Transactions on Knowledge Discovery from Data (TKDD)", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J1054", } @Article{Ge:2014:RMA, author = "Yong Ge and Guofei Jiang and Min Ding and Hui Xiong", title = "Ranking Metric Anomaly in Invariant Networks", journal = j-TKDD, volume = "8", number = "2", pages = "8:1--8:??", month = jun, year = "2014", CODEN = "????", DOI = "http://dx.doi.org/10.1145/2601436", ISSN = "1556-4681 (print), 1556-472X (electronic)", ISSN-L = "1556-4681", bibdate = "Thu Jun 26 05:48:22 MDT 2014", bibsource = "http://www.acm.org/pubs/contents/journals/tkdd/; http://www.math.utah.edu/pub/tex/bib/tkdd.bib", abstract = "The management of large-scale distributed information systems relies on the effective use and modeling of monitoring data collected at various points in the distributed information systems. A traditional approach to model monitoring data is to discover invariant relationships among the monitoring data. 
Indeed, we can discover all invariant relationships among all pairs of monitoring data and generate invariant networks, where a node is a monitoring data source (metric) and a link indicates an invariant relationship between two monitoring data. Such an invariant network representation can help system experts to localize and diagnose the system faults by examining those broken invariant relationships and their related metrics, since system faults usually propagate among the monitoring data and eventually lead to some broken invariant relationships. However, at one time, there are usually a lot of broken links (invariant relationships) within an invariant network. Without proper guidance, it is difficult for system experts to manually inspect this large number of broken links. To this end, in this article, we propose the problem of ranking metrics according to the anomaly levels for a given invariant network, while this is a nontrivial task due to the uncertainties and the complex nature of invariant networks. Specifically, we propose two types of algorithms for ranking metric anomaly by link analysis in invariant networks. Along this line, we first define two measurements to quantify the anomaly level of each metric, and introduce the m Rank algorithm. Also, we provide a weighted score mechanism and develop the g Rank algorithm, which involves an iterative process to obtain a score to measure the anomaly levels. In addition, some extended algorithms based on m Rank and g Rank algorithms are developed by taking into account the probability of being broken as well as noisy links. 
Finally, we validate all the proposed algorithms on a large number of real-world and synthetic data sets to illustrate the effectiveness and efficiency of different algorithms.", acknowledgement = ack-nhfb, articleno = "8", fjournal = "ACM Transactions on Knowledge Discovery from Data (TKDD)", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J1054", } @Article{Zhang:2014:DGP, author = "Gensheng Zhang and Xiao Jiang and Ping Luo and Min Wang and Chengkai Li", title = "Discovering General Prominent Streaks in Sequence Data", journal = j-TKDD, volume = "8", number = "2", pages = "9:1--9:??", month = jun, year = "2014", CODEN = "????", DOI = "http://dx.doi.org/10.1145/2601439", ISSN = "1556-4681 (print), 1556-472X (electronic)", ISSN-L = "1556-4681", bibdate = "Thu Jun 26 05:48:22 MDT 2014", bibsource = "http://www.acm.org/pubs/contents/journals/tkdd/; http://www.math.utah.edu/pub/tex/bib/tkdd.bib", abstract = "This article studies the problem of prominent streak discovery in sequence data. Given a sequence of values, a prominent streak is a long consecutive subsequence consisting of only large (small) values, such as consecutive games of outstanding performance in sports, consecutive hours of heavy network traffic, and consecutive days of frequent mentioning of a person in social media. Prominent streak discovery provides insightful data patterns for data analysis in many real-world applications and is an enabling technique for computational journalism. Given its real-world usefulness and complexity, the research on prominent streaks in sequence data opens a spectrum of challenging problems. A baseline approach to finding prominent streaks is a quadratic algorithm that exhaustively enumerates all possible streaks and performs pairwise streak dominance comparison. For more efficient methods, we make the observation that prominent streaks are in fact skyline points in two dimensions-streak interval length and minimum value in the interval. 
Our solution thus hinges on the idea to separate the two steps in prominent streak discovery: candidate streak generation and skyline operation over candidate streaks. For candidate generation, we propose the concept of local prominent streak (LPS). We prove that prominent streaks are a subset of LPSs and the number of LPSs is less than the length of a data sequence, in comparison with the quadratic number of candidates produced by the brute-force baseline method. We develop efficient algorithms based on the concept of LPS. The nonlinear local prominent streak (NLPS)-based method considers a superset of LPSs as candidates, and the linear local prominent streak (LLPS)-based method further guarantees to consider only LPSs. The proposed properties and algorithms are also extended for discovering general top- k, multisequence, and multidimensional prominent streaks. The results of experiments using multiple real datasets verified the effectiveness of the proposed methods and showed orders of magnitude performance improvement against the baseline method.", acknowledgement = ack-nhfb, articleno = "9", fjournal = "ACM Transactions on Knowledge Discovery from Data (TKDD)", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J1054", } @Article{Schifanella:2014:MTD, author = "Claudio Schifanella and K. 
Sel{\c{c}}uk Candan and Maria Luisa Sapino", title = "Multiresolution Tensor Decompositions with Mode Hierarchies", journal = j-TKDD, volume = "8", number = "2", pages = "10:1--10:??", month = jun, year = "2014", CODEN = "????", DOI = "http://dx.doi.org/10.1145/2532169", ISSN = "1556-4681 (print), 1556-472X (electronic)", ISSN-L = "1556-4681", bibdate = "Thu Jun 26 05:48:22 MDT 2014", bibsource = "http://www.acm.org/pubs/contents/journals/tkdd/; http://www.math.utah.edu/pub/tex/bib/tkdd.bib", abstract = "Tensors (multidimensional arrays) are widely used for representing high-order dimensional data, in applications ranging from social networks, sensor data, and Internet traffic. Multiway data analysis techniques, in particular tensor decompositions, allow extraction of hidden correlations among multiway data and thus are key components of many data analysis frameworks. Intuitively, these algorithms can be thought of as multiway clustering schemes, which consider multiple facets of the data in identifying clusters, their weights, and contributions of each data element. Unfortunately, algorithms for fitting multiway models are, in general, iterative and very time consuming. In this article, we observe that, in many applications, there is a priori background knowledge (or metadata) about one or more domain dimensions. This metadata is often in the form of a hierarchy that clusters the elements of a given data facet (or mode). We investigate whether such single-mode data hierarchies can be used to boost the efficiency of tensor decomposition process, without significant impact on the final decomposition quality. We consider each domain hierarchy as a guide to help provide higher- or lower-resolution views of the data in the tensor on demand and we rely on these metadata-induced multiresolution tensor representations to develop a multiresolution approach to tensor decomposition. 
In this article, we focus on an alternating least squares (ALS)--based implementation of the two most important decomposition models such as the PARAllel FACtors (PARAFAC, which decomposes a tensor into a diagonal tensor and a set of factor matrices) and the Tucker (which produces as result a core tensor and a set of dimension-subspaces matrices). Experiment results show that, when the available metadata is used as a rough guide, the proposed multiresolution method helps fit both PARAFAC and Tucker models with consistent (under different parameters settings) savings in execution time and memory consumption, while preserving the quality of the decomposition.", acknowledgement = ack-nhfb, articleno = "10", fjournal = "ACM Transactions on Knowledge Discovery from Data (TKDD)", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J1054", } @Article{Huang:2014:RMN, author = "Jin Huang and Feiping Nie and Heng Huang and Chris Ding", title = "Robust Manifold Nonnegative Matrix Factorization", journal = j-TKDD, volume = "8", number = "3", pages = "11:1--11:??", month = jun, year = "2014", CODEN = "????", DOI = "http://dx.doi.org/10.1145/2601434", ISSN = "1556-4681 (print), 1556-472X (electronic)", ISSN-L = "1556-4681", bibdate = "Tue Jun 3 13:50:26 MDT 2014", bibsource = "http://www.acm.org/pubs/contents/journals/tkdd/; http://www.math.utah.edu/pub/tex/bib/tkdd.bib", abstract = "Nonnegative Matrix Factorization (NMF) has been one of the most widely used clustering techniques for exploratory data analysis. However, since each data point enters the objective function with squared residue error, a few outliers with large errors easily dominate the objective function. In this article, we propose a Robust Manifold Nonnegative Matrix Factorization (RMNMF) method using l$_{2, 1}$ -norm and integrating NMF and spectral clustering under the same clustering framework. 
We also point out the solution uniqueness issue for the existing NMF methods and propose an additional orthonormal constraint to address this problem. With the new constraint, the conventional auxiliary function approach no longer works. We tackle this difficult optimization problem via a novel Augmented Lagrangian Method (ALM)--based algorithm and convert the original constrained optimization problem on one variable into a multivariate constrained problem. The new objective function then can be decomposed into several subproblems that each has a closed-form solution. More importantly, we reveal the connection of our method with robust K -means and spectral clustering, and we demonstrate its theoretical significance. Extensive experiments have been conducted on nine benchmark datasets, and all empirical results show the effectiveness of our method.", acknowledgement = ack-nhfb, articleno = "11", fjournal = "ACM Transactions on Knowledge Discovery from Data (TKDD)", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J1054", } @Article{Zhang:2014:RAL, author = "Yu Zhang and Dit-Yan Yeung", title = "A Regularization Approach to Learning Task Relationships in Multitask Learning", journal = j-TKDD, volume = "8", number = "3", pages = "12:1--12:??", month = jun, year = "2014", CODEN = "????", DOI = "http://dx.doi.org/10.1145/2538028", ISSN = "1556-4681 (print), 1556-472X (electronic)", ISSN-L = "1556-4681", bibdate = "Tue Jun 3 13:50:26 MDT 2014", bibsource = "http://www.acm.org/pubs/contents/journals/tkdd/; http://www.math.utah.edu/pub/tex/bib/tkdd.bib", abstract = "Multitask learning is a learning paradigm that seeks to improve the generalization performance of a learning task with the help of some other related tasks. In this article, we propose a regularization approach to learning the relationships between tasks in multitask learning. This approach can be viewed as a novel generalization of the regularized formulation for single-task learning. 
Besides modeling positive task correlation, our approach --- multitask relationship learning (MTRL) --- can also describe negative task correlation and identify outlier tasks based on the same underlying principle.
random Fourier features required to achieve good generalization performance. We show that when the loss function is strongly convex and smooth, online kernel learning with random Fourier features can achieve an $ O (l o g T / T) $ bound for the excess risk with only $ O (1 / \lambda^2)$ random Fourier features, where T is the number of training examples and \lambda is the modulus of strong convexity. This is a significant improvement compared to the existing result for batch kernel learning that requires $ O(T)$ random Fourier features to achieve a generalization bound $ O(1 / \sqrt T)$. Our empirical study verifies that online kernel learning with a limited number of random Fourier features can achieve similar generalization performance as online learning using full kernel matrix. We also present an enhanced online learning algorithm with random Fourier features that improves the classification performance by multiple passes of training examples and a partial average.", acknowledgement = ack-nhfb, articleno = "13", fjournal = "ACM Transactions on Knowledge Discovery from Data (TKDD)", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J1054", } @Article{Eyal:2014:PIM, author = "Ron Eyal and Avi Rosenfeld and Sigal Sina and Sarit Kraus", title = "Predicting and Identifying Missing Node Information in Social Networks", journal = j-TKDD, volume = "8", number = "3", pages = "14:1--14:??", month = jun, year = "2014", CODEN = "????", DOI = "http://dx.doi.org/10.1145/2536775", ISSN = "1556-4681 (print), 1556-472X (electronic)", ISSN-L = "1556-4681", bibdate = "Thu Jun 26 05:48:23 MDT 2014", bibsource = "http://www.acm.org/pubs/contents/journals/tkdd/; http://www.math.utah.edu/pub/tex/bib/tkdd.bib", abstract = "In recent years, social networks have surged in popularity. One key aspect of social network research is identifying important missing information that is not explicitly represented in the network, or is not visible to all. 
To date, this line of research typically focused on finding the connections that are missing between nodes, a challenge typically termed as the link prediction problem. This article introduces the missing node identification problem, where missing members in the social network structure must be identified. In this problem, indications of missing nodes are assumed to exist. Given these indications and a partial network, we must assess which indications originate from the same missing node and determine the full network structure. Toward solving this problem, we present the missing node identification by spectral clustering algorithm (MISC), an approach based on a spectral clustering algorithm, combined with nodes' pairwise affinity measures that were adopted from link prediction research. We evaluate the performance of our approach in different problem settings and scenarios, using real-life data from Facebook. The results show that our approach has beneficial results and can be effective in solving the missing node identification problem. In addition, this article also presents R-MISC, which uses a sparse matrix representation, efficient algorithms for calculating the nodes' pairwise affinity, and a proprietary dimension reduction technique to enable scaling the MISC algorithm to large networks of more than 100,000 nodes. Last, we consider problem settings where some of the indications are unknown. Two algorithms are suggested for this problem: speculative MISC, based on MISC, and missing link completion, based on classical link prediction literature. We show that speculative MISC outperforms missing link completion.", acknowledgement = ack-nhfb, articleno = "14", fjournal = "ACM Transactions on Knowledge Discovery from Data (TKDD)", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J1054", } @Article{Webb:2014:EDM, author = "Geoffrey I. 
Webb and Jilles Vreeken", title = "Efficient Discovery of the Most Interesting Associations", journal = j-TKDD, volume = "8", number = "3", pages = "15:1--15:??", month = jun, year = "2014", CODEN = "????", DOI = "http://dx.doi.org/10.1145/2601433", ISSN = "1556-4681 (print), 1556-472X (electronic)", ISSN-L = "1556-4681", bibdate = "Thu Jun 26 05:48:23 MDT 2014", bibsource = "http://www.acm.org/pubs/contents/journals/tkdd/; http://www.math.utah.edu/pub/tex/bib/tkdd.bib", abstract = "Self-sufficient itemsets have been proposed as an effective approach to summarizing the key associations in data. However, their computation appears highly demanding, as assessing whether an itemset is self-sufficient requires consideration of all pairwise partitions of the itemset into pairs of subsets as well as consideration of all supersets. This article presents the first published algorithm for efficiently discovering self-sufficient itemsets. This branch-and-bound algorithm deploys two powerful pruning mechanisms based on upper bounds on itemset value and statistical significance level. It demonstrates that finding top- k productive and nonredundant itemsets, with postprocessing to identify those that are not independently productive, can efficiently identify small sets of key associations. 
We present extensive evaluation of the strengths and limitations of the technique, including comparisons with alternative approaches to finding the most interesting associations.", acknowledgement = ack-nhfb, articleno = "15", fjournal = "ACM Transactions on Knowledge Discovery from Data (TKDD)", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J1054", } @Article{Shabtai:2014:ODM, author = "Asaf Shabtai and Maya Bercovitch and Lior Rokach and Yuval Elovici", title = "Optimizing Data Misuse Detection", journal = j-TKDD, volume = "8", number = "3", pages = "16:1--16:??", month = jun, year = "2014", CODEN = "????", DOI = "http://dx.doi.org/10.1145/2611520", ISSN = "1556-4681 (print), 1556-472X (electronic)", ISSN-L = "1556-4681", bibdate = "Tue Jun 3 13:50:26 MDT 2014", bibsource = "http://www.acm.org/pubs/contents/journals/tkdd/; http://www.math.utah.edu/pub/tex/bib/tkdd.bib", abstract = "Data misuse may be performed by entities such as an organization's employees and business partners who are granted access to sensitive information and misuse their privileges. We assume that users can be either trusted or untrusted. The access of untrusted parties to data objects (e.g., client and patient records) should be monitored in an attempt to detect misuse. However, monitoring data objects is resource intensive and time-consuming and may also cause disturbance or inconvenience to the involved employees. Therefore, the monitored data objects should be carefully selected. In this article, we present two optimization problems carefully designed for selecting specific data objects for monitoring, such that the detection rate is maximized and the monitoring effort is minimized. In the first optimization problem, the goal is to select data objects for monitoring that are accessed by at most c trusted agents while ensuring access to at least k monitored objects by each untrusted agent (both c and k are integer variable). 
As opposed to the first optimization problem, the goal of the second optimization problem is to select monitored data objects that maximize the number of monitored data objects accessed by untrusted agents while ensuring that each trusted agent does not access more than d monitored data objects (d is an integer variable as well). Two efficient heuristic algorithms for solving these optimization problems are proposed, and experiments were conducted simulating different scenarios to evaluate the algorithms' performance. Moreover, we compared the heuristic algorithms' performance to the optimal solution and conducted sensitivity analysis on the three parameters (c, k, and d) and on the ratio between the trusted and untrusted agents.", acknowledgement = ack-nhfb, articleno = "16", fjournal = "ACM Transactions on Knowledge Discovery from Data (TKDD)", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J1054", } @Article{Hernandez-Orallo:2014:PRC, author = "Jos{\'e} Hern{\'a}ndez-Orallo", title = "Probabilistic Reframing for Cost-Sensitive Regression", journal = j-TKDD, volume = "8", number = "4", pages = "17:1--17:??", month = aug, year = "2014", CODEN = "????", DOI = "http://dx.doi.org/10.1145/2641758", ISSN = "1556-4681 (print), 1556-472X (electronic)", ISSN-L = "1556-4681", bibdate = "Tue Aug 26 17:49:02 MDT 2014", bibsource = "http://www.acm.org/pubs/contents/journals/tkdd/; http://www.math.utah.edu/pub/tex/bib/tkdd.bib", abstract = "Common-day applications of predictive models usually involve the full use of the available contextual information. When the operating context changes, one may fine-tune the by-default (incontextual) prediction or may even abstain from predicting a value (a reject). Global reframing solutions, where the same function is applied to adapt the estimated outputs to a new cost context, are possible solutions here. 
An alternative approach, which has not been studied in a comprehensive way for regression in the knowledge discovery and data mining literature, is the use of a local (e.g., probabilistic) reframing approach, where decisions are made according to the estimated output and a reliability, confidence, or probability estimation. In this article, we advocate for a simple two-parameter (mean and variance) approach, working with a normal conditional probability density. Given the conditional mean produced by any regression technique, we develop lightweight ``enrichment'' methods that produce good estimates of the conditional variance, which are used by the probabilistic (local) reframing methods. We apply these methods to some very common families of cost-sensitive problems, such as optimal predictions in (auction) bids, asymmetric loss scenarios, and rejection rules.", acknowledgement = ack-nhfb, articleno = "17", fjournal = "ACM Transactions on Knowledge Discovery from Data (TKDD)", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J1054", } @Article{Miettinen:2014:MMD, author = "Pauli Miettinen and Jilles Vreeken", title = "{MDL4BMF}: Minimum Description Length for {Boolean} Matrix Factorization", journal = j-TKDD, volume = "8", number = "4", pages = "18:1--18:??", month = oct, year = "2014", CODEN = "????", DOI = "http://dx.doi.org/10.1145/2601437", ISSN = "1556-4681 (print), 1556-472X (electronic)", ISSN-L = "1556-4681", bibdate = "Tue Oct 7 18:45:26 MDT 2014", bibsource = "http://www.acm.org/pubs/contents/journals/tkdd/; http://www.math.utah.edu/pub/tex/bib/tkdd.bib", abstract = "Matrix factorizations-where a given data matrix is approximated by a product of two or more factor matrices-are powerful data mining tools. Among other tasks, matrix factorizations are often used to separate global structure from noise. 
This, however, requires solving the ``model order selection problem'' of determining the proper rank of the factorization, that is, to answer where fine-grained structure stops, and where noise starts. Boolean Matrix Factorization (BMF)-where data, factors, and matrix product are Boolean-has in recent years received increased attention from the data mining community. The technique has desirable properties, such as high interpretability and natural sparsity. Yet, so far no method for selecting the correct model order for BMF has been available. In this article, we propose the use of the Minimum Description Length (MDL) principle for this task. Besides solving the problem, this well-founded approach has numerous benefits; for example, it is automatic, does not require a likelihood function, is fast, and, as experiments show, is highly accurate. We formulate the description length function for BMF in general-making it applicable for any BMF algorithm. We discuss how to construct an appropriate encoding: starting from a simple and intuitive approach, we arrive at a highly efficient data-to-model--based encoding for BMF. 
We extend an existing algorithm for BMF to use MDL to identify the best Boolean matrix factorization, analyze the complexity of the problem, and perform an extensive experimental evaluation to study its behavior.", acknowledgement = ack-nhfb, articleno = "18", fjournal = "ACM Transactions on Knowledge Discovery from Data (TKDD)", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J1054", } @Article{Tang:2014:FSS, author = "Jiliang Tang and Huan Liu", title = "Feature Selection for Social Media Data", journal = j-TKDD, volume = "8", number = "4", pages = "19:1--19:??", month = oct, year = "2014", CODEN = "????", DOI = "http://dx.doi.org/10.1145/2629587", ISSN = "1556-4681 (print), 1556-472X (electronic)", ISSN-L = "1556-4681", bibdate = "Tue Oct 7 18:45:26 MDT 2014", bibsource = "http://www.acm.org/pubs/contents/journals/tkdd/; http://www.math.utah.edu/pub/tex/bib/tkdd.bib", abstract = "Feature selection is widely used in preparing high-dimensional data for effective data mining. The explosive popularity of social media produces massive and high-dimensional data at an unprecedented rate, presenting new challenges to feature selection. Social media data consists of (1) traditional high-dimensional, attribute-value data such as posts, tweets, comments, and images, and (2) linked data that provides social context for posts and describes the relationships between social media users as well as who generates the posts, and so on. The nature of social media also determines that its data is massive, noisy, and incomplete, which exacerbates the already challenging problem of feature selection. In this article, we study a novel feature selection problem of selecting features for social media data with its social context. In detail, we illustrate the differences between attribute-value data and social media data, investigate if linked data can be exploited in a new feature selection framework by taking advantage of social science theories. 
We design and conduct experiments on datasets from real-world social media Web sites, and the empirical results demonstrate that the proposed framework can significantly improve the performance of feature selection. Further experiments are conducted to evaluate the effects of user--user and user--post relationships manifested in linked data on feature selection, and research issues for future work will be discussed.", acknowledgement = ack-nhfb, articleno = "19", fjournal = "ACM Transactions on Knowledge Discovery from Data (TKDD)", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J1054", } @Article{Riondato:2014:EDA, author = "Matteo Riondato and Eli Upfal", title = "Efficient Discovery of Association Rules and Frequent Itemsets through Sampling with Tight Performance Guarantees", journal = j-TKDD, volume = "8", number = "4", pages = "20:1--20:??", month = oct, year = "2014", CODEN = "????", DOI = "http://dx.doi.org/10.1145/2629586", ISSN = "1556-4681 (print), 1556-472X (electronic)", ISSN-L = "1556-4681", bibdate = "Tue Oct 7 18:45:26 MDT 2014", bibsource = "http://www.acm.org/pubs/contents/journals/tkdd/; http://www.math.utah.edu/pub/tex/bib/tkdd.bib", abstract = "The tasks of extracting (top- K ) Frequent Itemsets (FIs) and Association Rules (ARs) are fundamental primitives in data mining and database applications. Exact algorithms for these problems exist and are widely used, but their running time is hindered by the need of scanning the entire dataset, possibly multiple times. High-quality approximations of FIs and ARs are sufficient for most practical uses. Sampling techniques can be used for fast discovery of approximate solutions, but works exploring this technique did not provide satisfactory performance guarantees on the quality of the approximation due to the difficulty of bounding the probability of under- or oversampling any one of an unknown number of frequent itemsets. 
We circumvent this issue by applying the statistical concept of Vapnik--Chervonenkis (VC) dimension to develop a novel technique for providing tight bounds on the sample size that guarantees approximation of the (top- K ) FIs and ARs within user-specified parameters. The resulting sample size is linearly dependent on the VC-dimension of a range space associated with the dataset. We analyze the VC-dimension of this range space and show that it is upper bounded by an easy-to-compute characteristic quantity of the dataset, the d-index, namely, the maximum integer d such that the dataset contains at least d transactions of length at least d such that no one of them is a superset of or equal to another. We show that this bound is tight for a large class of datasets. The resulting sample size is a significant improvement over previous known results. We present an extensive experimental evaluation of our technique on real and artificial datasets, demonstrating the practicality of our methods, and showing that they achieve even higher quality approximations than what is guaranteed by the analysis.", acknowledgement = ack-nhfb, articleno = "20", fjournal = "ACM Transactions on Knowledge Discovery from Data (TKDD)", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J1054", } @Article{Burton:2014:DSC, author = "Scott H. Burton and Christophe G. 
Giraud-Carrier", title = "Discovering Social Circles in Directed Graphs", journal = j-TKDD, volume = "8", number = "4", pages = "21:1--21:??", month = aug, year = "2014", CODEN = "????", DOI = "http://dx.doi.org/10.1145/2641759", ISSN = "1556-4681 (print), 1556-472X (electronic)", ISSN-L = "1556-4681", bibdate = "Tue Aug 26 17:49:02 MDT 2014", bibsource = "http://www.acm.org/pubs/contents/journals/tkdd/; http://www.math.utah.edu/pub/tex/bib/tkdd.bib", abstract = "We examine the problem of identifying social circles, or sets of cohesive and mutually aware nodes surrounding an initial query set, in directed graphs where the complete graph is not known beforehand. This problem differs from local community mining, in that the query set defines the circle of interest. We explicitly handle edge direction, as in many cases relationships are not symmetric, and focus on the local context because many real-world graphs cannot be feasibly known. We outline several issues that are unique to this context, introduce a quality function to measure the value of including a particular node in an emerging social circle, and describe a greedy social circle discovery algorithm. 
We demonstrate the effectiveness of this approach on artificial benchmarks, large networks with topical community labels, and several real-world case studies.", acknowledgement = ack-nhfb, articleno = "21", fjournal = "ACM Transactions on Knowledge Discovery from Data (TKDD)", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J1054", } @Article{Paul:2014:RPL, author = "Saurabh Paul and Christos Boutsidis and Malik Magdon-Ismail and Petros Drineas", title = "Random Projections for Linear Support Vector Machines", journal = j-TKDD, volume = "8", number = "4", pages = "22:1--22:??", month = oct, year = "2014", CODEN = "????", DOI = "http://dx.doi.org/10.1145/2641760", ISSN = "1556-4681 (print), 1556-472X (electronic)", ISSN-L = "1556-4681", bibdate = "Tue Oct 7 18:45:26 MDT 2014", bibsource = "http://www.acm.org/pubs/contents/journals/tkdd/; http://www.math.utah.edu/pub/tex/bib/tkdd.bib", abstract = "Let $X$ be a data matrix of rank $ \rho $, whose rows represent $n$ points in $d$-dimensional space. The linear support vector machine constructs a hyperplane separator that maximizes the 1-norm soft margin. We develop a new oblivious dimension reduction technique that is precomputed and can be applied to any input matrix $X$. We prove that, with high probability, the margin and minimum enclosing ball in the feature space are preserved to within $ \epsilon $-relative error, ensuring comparable generalization as in the original space in the case of classification. For regression, we show that the margin is preserved to $ \epsilon $-relative error with high probability. 
We present extensive experiments with real and synthetic data to support our theory.", acknowledgement = ack-nhfb, articleno = "22", fjournal = "ACM Transactions on Knowledge Discovery from Data (TKDD)", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J1054", } @Article{Erdo:2014:RGN, author = "D{\'o}ra Erd{\H{o}}s and Rainer Gemulla and Evimaria Terzi", title = "Reconstructing Graphs from Neighborhood Data", journal = j-TKDD, volume = "8", number = "4", pages = "23:1--23:??", month = aug, year = "2014", CODEN = "????", DOI = "http://dx.doi.org/10.1145/2641761", ISSN = "1556-4681 (print), 1556-472X (electronic)", ISSN-L = "1556-4681", bibdate = "Tue Aug 26 17:49:02 MDT 2014", bibsource = "http://www.acm.org/pubs/contents/journals/tkdd/; http://www.math.utah.edu/pub/tex/bib/tkdd.bib", abstract = "Consider a social network and suppose that we are only given the number of common friends between each pair of users. Can we reconstruct the underlying network? Similarly, consider a set of documents and the words that appear in them. If we only know the number of common words for every pair of documents, as well as the number of common documents for every pair of words, can we infer which words appear in which documents? In this article, we develop a general methodology for answering questions like these. We formalize these questions in what we call the {\em R}econstruct problem: given information about the common neighbors of nodes in a network, our goal is to reconstruct the hidden binary matrix that indicates the presence or absence of relationships between individual nodes. In fact, we propose two different variants of this problem: one where the number of connections of every node (i.e., the degree of every node) is known and a second one where it is unknown. We call these variants the degree-aware and the degree-oblivious versions of the Reconstruct problem, respectively. 
Our algorithms for both variants exploit the properties of the singular value decomposition of the hidden binary matrix. More specifically, we show that using the available neighborhood information, we can reconstruct the hidden matrix by finding the components of its singular value decomposition and then combining them appropriately. Our extensive experimental study suggests that our methods are able to reconstruct binary matrices of different characteristics with up to 100\% accuracy.", acknowledgement = ack-nhfb, articleno = "23", fjournal = "ACM Transactions on Knowledge Discovery from Data (TKDD)", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J1054", } @Article{Acharya:2014:OFC, author = "Ayan Acharya and Eduardo R. Hruschka and Joydeep Ghosh and Sreangsu Acharyya", title = "An Optimization Framework for Combining Ensembles of Classifiers and Clusterers with Applications to Nontransductive Semisupervised Learning and Transfer Learning", journal = j-TKDD, volume = "9", number = "1", pages = "1:1--1:??", month = aug, year = "2014", CODEN = "????", DOI = "http://dx.doi.org/10.1145/2601435", ISSN = "1556-4681 (print), 1556-472X (electronic)", ISSN-L = "1556-4681", bibdate = "Tue Aug 26 17:49:05 MDT 2014", bibsource = "http://www.acm.org/pubs/contents/journals/tkdd/; http://www.math.utah.edu/pub/tex/bib/tkdd.bib", abstract = "Unsupervised models can provide supplementary soft constraints to help classify new ``target'' data because similar instances in the target set are more likely to share the same class label. Such models can also help detect possible differences between training and target distributions, which is useful in applications where concept drift may take place, as in transfer learning settings. 
This article describes a general optimization framework that takes as input class membership estimates from existing classifiers learned on previously encountered ``source'' (or training) data, as well as a similarity matrix from a cluster ensemble operating solely on the target (or test) data to be classified, and yields a consensus labeling of the target data. More precisely, the application settings considered are nontransductive semisupervised and transfer learning scenarios where the training data are used only to build an ensemble of classifiers and are subsequently discarded before classifying the target data. The framework admits a wide range of loss functions and classification/clustering methods. It exploits properties of Bregman divergences in conjunction with Legendre duality to yield a principled and scalable approach. A variety of experiments show that the proposed framework can yield results substantially superior to those provided by na{\"\i}vely applying classifiers learned on the original task to the target data. In addition, we show that the proposed approach, even not being conceptually transductive, can provide better results compared to some popular transductive learning techniques.", acknowledgement = ack-nhfb, articleno = "1", fjournal = "ACM Transactions on Knowledge Discovery from Data (TKDD)", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J1054", } @Article{Boedihardjo:2014:FEL, author = "Arnold P. 
Boedihardjo and Chang-Tien Lu and Bingsheng Wang", title = "A Framework for Exploiting Local Information to Enhance Density Estimation of Data Streams", journal = j-TKDD, volume = "9", number = "1", pages = "2:1--2:??", month = aug, year = "2014", CODEN = "????", DOI = "http://dx.doi.org/10.1145/2629618", ISSN = "1556-4681 (print), 1556-472X (electronic)", ISSN-L = "1556-4681", bibdate = "Tue Aug 26 17:49:05 MDT 2014", bibsource = "http://www.acm.org/pubs/contents/journals/tkdd/; http://www.math.utah.edu/pub/tex/bib/tkdd.bib", abstract = "The Probability Density Function (PDF) is the fundamental data model for a variety of stream mining algorithms. Existing works apply the standard nonparametric Kernel Density Estimator (KDE) to approximate the PDF of data streams. As a result, the stream-based KDEs cannot accurately capture complex local density features. In this article, we propose the use of Local Regions (LRs) to model local density information in univariate data streams. In-depth theoretical analyses are presented to justify the effectiveness of the LR-based KDE. Based on the analyses, we develop the General Local rEgion AlgorithM (GLEAM) to enhance the estimation quality of structurally complex univariate distributions for existing stream-based KDEs. A set of algorithmic optimizations is designed to improve the query throughput of GLEAM and to achieve its linear order computation. 
Additionally, a comprehensive suite of experiments was conducted to test the effectiveness and efficiency of GLEAM.", acknowledgement = ack-nhfb, articleno = "2", fjournal = "ACM Transactions on Knowledge Discovery from Data (TKDD)", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J1054", } @Article{Ordonez:2014:BVS, author = "Carlos Ordonez and Carlos Garcia-Alvarado and Veerabhadran Baladandayuthapani", title = "{Bayesian} Variable Selection in Linear Regression in One Pass for Large Datasets", journal = j-TKDD, volume = "9", number = "1", pages = "3:1--3:??", month = aug, year = "2014", CODEN = "????", DOI = "http://dx.doi.org/10.1145/2629617", ISSN = "1556-4681 (print), 1556-472X (electronic)", ISSN-L = "1556-4681", bibdate = "Tue Aug 26 17:49:05 MDT 2014", bibsource = "http://www.acm.org/pubs/contents/journals/tkdd/; http://www.math.utah.edu/pub/tex/bib/tkdd.bib", abstract = "Bayesian models are generally computed with Markov Chain Monte Carlo (MCMC) methods. The main disadvantage of MCMC methods is the large number of iterations they need to sample the posterior distributions of model parameters, especially for large datasets. On the other hand, variable selection remains a challenging problem due to its combinatorial search space, where Bayesian models are a promising solution. In this work, we study how to accelerate Bayesian model computation for variable selection in linear regression. We propose a fast Gibbs sampler algorithm, a widely used MCMC method that incorporates several optimizations. We use a Zellner prior for the regression coefficients, an improper prior on variance, and a conjugate prior Gaussian distribution, which enable dataset summarization in one pass, thus exploiting an augmented set of sufficient statistics. Thereafter, the algorithm iterates in main memory. Sufficient statistics are indexed with a sparse binary vector to efficiently compute matrix projections based on selected variables. 
Discovered variable subsets probabilities, selecting and discarding each variable, are stored on a hash table for fast retrieval in future iterations. We study how to integrate our algorithm into a Database Management System (DBMS), exploiting aggregate User-Defined Functions for parallel data summarization and stored procedures to manipulate matrices with arrays. An experimental evaluation with real datasets evaluates accuracy and time performance, comparing our DBMS-based algorithm with the R package. Our algorithm is shown to produce accurate results, scale linearly on dataset size, and run orders of magnitude faster than the R package.", acknowledgement = ack-nhfb, articleno = "3", fjournal = "ACM Transactions on Knowledge Discovery from Data (TKDD)", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J1054", } @Article{Fei:2014:SSB, author = "Hongliang Fei and Jun Huan", title = "Structured Sparse Boosting for Graph Classification", journal = j-TKDD, volume = "9", number = "1", pages = "4:1--4:??", month = aug, year = "2014", CODEN = "????", DOI = "http://dx.doi.org/10.1145/2629328", ISSN = "1556-4681 (print), 1556-472X (electronic)", ISSN-L = "1556-4681", bibdate = "Tue Aug 26 17:49:05 MDT 2014", bibsource = "http://www.acm.org/pubs/contents/journals/tkdd/; http://www.math.utah.edu/pub/tex/bib/tkdd.bib", abstract = "Boosting is a highly effective algorithm that produces a linear combination of weak classifiers (a.k.a. base learners) to obtain high-quality classification models. In this article, we propose a generalized logit boost algorithm in which base learners have structural relationships in the functional space. Although such relationships are generic, our work is particularly motivated by the emerging topic of pattern-based classification for semistructured data including graphs. 
Toward an efficient incorporation of the structure information, we have designed a general model in which we use an undirected graph to capture the relationship of subgraph-based base learners. In our method, we employ both L$_1$ and Laplacian-based L$_2$ regularization to logit boosting to achieve model sparsity and smoothness in the functional space spanned by the base learners. We have derived efficient optimization algorithms based on coordinate descent for the new boosting formulation and theoretically prove that it exhibits a natural grouping effect for nearby spatial or overlapping base learners and that the resulting estimator is consistent. Additionally, motivated by the connection between logit boosting and logistic regression, we extend our structured sparse regularization framework to logistic regression for vectorial data in which features are structured. Using comprehensive experimental study and comparing our work with the state-of-the-art, we have demonstrated the effectiveness of the proposed learning method.", acknowledgement = ack-nhfb, articleno = "4", fjournal = "ACM Transactions on Knowledge Discovery from Data (TKDD)", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J1054", } @Article{Xu:2014:GGB, author = "Zhiqiang Xu and Yiping Ke and Yi Wang and Hong Cheng and James Cheng", title = "{GBAGC}: a General {Bayesian} Framework for Attributed Graph Clustering", journal = j-TKDD, volume = "9", number = "1", pages = "5:1--5:??", month = aug, year = "2014", CODEN = "????", DOI = "http://dx.doi.org/10.1145/2629616", ISSN = "1556-4681 (print), 1556-472X (electronic)", ISSN-L = "1556-4681", bibdate = "Tue Aug 26 17:49:05 MDT 2014", bibsource = "http://www.acm.org/pubs/contents/journals/tkdd/; http://www.math.utah.edu/pub/tex/bib/tkdd.bib", abstract = "Graph clustering, also known as community detection, is a long-standing problem in data mining. 
In recent years, with the proliferation of rich attribute information available for objects in real-world graphs, how to leverage not only structural but also attribute information for clustering attributed graphs becomes a new challenge. Most existing works took a distance-based approach. They proposed various distance measures to fuse structural and attribute information and then applied standard techniques for graph clustering based on these distance measures. In this article, we take an alternative view and propose a novel Bayesian framework for attributed graph clustering. Our framework provides a general and principled solution to modeling both the structural and the attribute aspects of a graph. It avoids the artificial design of a distance measure in existing methods and, furthermore, can seamlessly handle graphs with different types of edges and vertex attributes. We develop an efficient variational method for graph clustering under this framework and derive two concrete algorithms for clustering unweighted and weighted attributed graphs. 
Experimental results on large real-world datasets show that our algorithms significantly outperform the state-of-the-art distance-based method, in terms of both effectiveness and efficiency.", acknowledgement = ack-nhfb, articleno = "5", fjournal = "ACM Transactions on Knowledge Discovery from Data (TKDD)", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J1054", } @Article{Coscia:2014:UHO, author = "Michele Coscia and Giulio Rossetti and Fosca Giannotti and Dino Pedreschi", title = "Uncovering Hierarchical and Overlapping Communities with a Local-First Approach", journal = j-TKDD, volume = "9", number = "1", pages = "6:1--6:??", month = aug, year = "2014", CODEN = "????", DOI = "http://dx.doi.org/10.1145/2629511", ISSN = "1556-4681 (print), 1556-472X (electronic)", ISSN-L = "1556-4681", bibdate = "Tue Aug 26 17:49:05 MDT 2014", bibsource = "http://www.acm.org/pubs/contents/journals/tkdd/; http://www.math.utah.edu/pub/tex/bib/tkdd.bib", abstract = "Community discovery in complex networks is the task of organizing a network's structure by grouping together nodes related to each other. Traditional approaches are based on the assumption that there is a global-level organization in the network. However, in many scenarios, each node is the bearer of complex information and cannot be classified in disjoint clusters. The top-down global view of the partition approach is not designed for this. Here, we represent this complex information as multiple latent labels, and we postulate that edges in the networks are created among nodes carrying similar labels. The latent labels are the communities a node belongs to and we discover them with a simple local-first approach to community discovery. 
This is achieved by democratically letting each node vote for the communities it sees surrounding it in its limited view of the global system, its ego neighborhood, using a label propagation algorithm, assuming that each node is aware of the label it shares with each of its connections. The local communities are merged hierarchically, unveiling the modular organization of the network at the global level and identifying overlapping groups and groups of groups. We tested this intuition against the state-of-the-art overlapping community discovery and found that our new method advances in the chosen scenarios in the quality of the obtained communities. We perform a test on benchmark and on real-world networks, evaluating the quality of the community coverage by using the extracted communities to predict the metadata attached to the nodes, which we consider external information about the latent labels. We also provide an explanation about why real-world networks contain overlapping communities and how our logic is able to capture them. 
Finally, we show how our method is deterministic, is incremental, and has a limited time complexity, so that it can be used on real-world scale networks.", acknowledgement = ack-nhfb, articleno = "6", fjournal = "ACM Transactions on Knowledge Discovery from Data (TKDD)", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J1054", } @Article{Wang:2014:GML, author = "Guangtao Wang and Qinbao Song and Xueying Zhang and Kaiyuan Zhang", title = "A Generic Multilabel Learning-Based Classification Algorithm Recommendation Method", journal = j-TKDD, volume = "9", number = "1", pages = "7:1--7:??", month = oct, year = "2014", CODEN = "????", DOI = "http://dx.doi.org/10.1145/2629474", ISSN = "1556-4681 (print), 1556-472X (electronic)", ISSN-L = "1556-4681", bibdate = "Fri Oct 10 17:19:10 MDT 2014", bibsource = "http://www.acm.org/pubs/contents/journals/tkdd/; http://www.math.utah.edu/pub/tex/bib/tkdd.bib", abstract = "As more and more classification algorithms continue to be developed, recommending appropriate algorithms to a given classification problem is increasingly important. This article first distinguishes the algorithm recommendation methods by two dimensions: (1) meta-features, which are a set of measures used to characterize the learning problems, and (2) meta-target, which represents the relative performance of the classification algorithms on the learning problem. In contrast to the existing algorithm recommendation methods whose meta-target is usually in the form of either the ranking of candidate algorithms or a single algorithm, this article proposes a new and natural multilabel form to describe the meta-target. This is due to the fact that there would be multiple algorithms being appropriate for a given problem in practice. 
Furthermore, a novel multilabel learning-based generic algorithm recommendation method is proposed, which views the algorithm recommendation as a multilabel learning problem and solves the problem by the mature multilabel learning algorithms. To evaluate the proposed multilabel learning-based recommendation method, extensive experiments with 13 well-known classification algorithms, two kinds of meta-targets such as algorithm ranking and single algorithm, and five different kinds of meta-features are conducted on 1,090 benchmark learning problems. The results show the effectiveness of our proposed multilabel learning-based recommendation method.", acknowledgement = ack-nhfb, articleno = "7", fjournal = "ACM Transactions on Knowledge Discovery from Data (TKDD)", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J1054", } @Article{Wang:2014:EEM, author = "Pinghui Wang and John C. S. Lui and Bruno Ribeiro and Don Towsley and Junzhou Zhao and Xiaohong Guan", title = "Efficiently Estimating Motif Statistics of Large Networks", journal = j-TKDD, volume = "9", number = "2", pages = "8:1--8:??", month = sep, year = "2014", CODEN = "????", DOI = "http://dx.doi.org/10.1145/2629564", ISSN = "1556-4681 (print), 1556-472X (electronic)", ISSN-L = "1556-4681", bibdate = "Tue Oct 7 18:49:26 MDT 2014", bibsource = "http://www.acm.org/pubs/contents/journals/tkdd/; http://www.math.utah.edu/pub/tex/bib/tkdd.bib", abstract = "Exploring statistics of locally connected subgraph patterns (also known as network motifs) has helped researchers better understand the structure and function of biological and Online Social Networks (OSNs). Nowadays, the massive size of some critical networks-often stored in already overloaded relational databases-effectively limits the rate at which nodes and edges can be explored, making it a challenge to accurately discover subgraph statistics. 
In this work, we propose sampling methods to accurately estimate subgraph statistics from as few queried nodes as possible. We present sampling algorithms that efficiently and accurately estimate subgraph properties of massive networks. Our algorithms require no precomputation or complete network topology information. At the same time, we provide theoretical guarantees of convergence. We perform experiments using widely known datasets and show that, for the same accuracy, our algorithms require an order of magnitude less queries (samples) than the current state-of-the-art algorithms.", acknowledgement = ack-nhfb, articleno = "8", fjournal = "ACM Transactions on Knowledge Discovery from Data (TKDD)", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J1054", } @Article{Zheng:2014:FHE, author = "Li Zheng and Tao Li and Chris Ding", title = "A Framework for Hierarchical Ensemble Clustering", journal = j-TKDD, volume = "9", number = "2", pages = "9:1--9:??", month = sep, year = "2014", CODEN = "????", DOI = "http://dx.doi.org/10.1145/2611380", ISSN = "1556-4681 (print), 1556-472X (electronic)", ISSN-L = "1556-4681", bibdate = "Tue Oct 7 18:49:26 MDT 2014", bibsource = "http://www.acm.org/pubs/contents/journals/tkdd/; http://www.math.utah.edu/pub/tex/bib/tkdd.bib", abstract = "Ensemble clustering, as an important extension of the clustering problem, refers to the problem of combining different (input) clusterings of a given dataset to generate a final (consensus) clustering that is a better fit in some sense than existing clusterings. Over the past few years, many ensemble clustering approaches have been developed. However, most of them are designed for partitional clustering methods, and few research efforts have been reported for ensemble hierarchical clustering methods. In this article, a hierarchical ensemble clustering framework that can naturally combine both partitional clustering and hierarchical clustering results is proposed. 
In addition, a novel method for learning the ultra-metric distance from the aggregated distance matrices and generating final hierarchical clustering with enhanced cluster separation is developed based on the ultra-metric distance for hierarchical clustering. We study three important problems: dendrogram description, dendrogram combination, and dendrogram selection. We develop two approaches for dendrogram selection based on tree distances, and we investigate various dendrogram distances for representing dendrograms. We provide a systematic empirical study of the ensemble hierarchical clustering problem. Experimental results demonstrate the effectiveness of our proposed approaches.", acknowledgement = ack-nhfb, articleno = "9", fjournal = "ACM Transactions on Knowledge Discovery from Data (TKDD)", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J1054", } @Article{Huai:2014:TPC, author = "Baoxing Huai and Enhong Chen and Hengshu Zhu and Hui Xiong and Tengfei Bao and Qi Liu and Jilei Tian", title = "Toward Personalized Context Recognition for Mobile Users: a Semisupervised {Bayesian} {HMM} Approach", journal = j-TKDD, volume = "9", number = "2", pages = "10:1--10:??", month = sep, year = "2014", CODEN = "????", DOI = "http://dx.doi.org/10.1145/2629504", ISSN = "1556-4681 (print), 1556-472X (electronic)", ISSN-L = "1556-4681", bibdate = "Tue Oct 7 18:49:26 MDT 2014", bibsource = "http://www.acm.org/pubs/contents/journals/tkdd/; http://www.math.utah.edu/pub/tex/bib/tkdd.bib", abstract = "The problem of mobile context recognition targets the identification of semantic meaning of context in a mobile environment. This plays an important role in understanding mobile user behaviors and thus provides the opportunity for the development of better intelligent context-aware services. A key step of context recognition is to model the personalized contextual information of mobile users. 
Although many studies have been devoted to mobile context modeling, limited efforts have been made on the exploitation of the sequential and dependency characteristics of mobile contextual information. Also, the latent semantics behind mobile context are often ambiguous and poorly understood. Indeed, a promising direction is to incorporate some domain knowledge of common contexts, such as ``waiting for a bus'' or ``having dinner,'' by modeling both labeled and unlabeled context data from mobile users because there are often few labeled contexts available in practice. To this end, in this article, we propose a sequence-based semisupervised approach to modeling personalized context for mobile users. Specifically, we first exploit the Bayesian Hidden Markov Model (B-HMM) for modeling context in the form of probabilistic distributions and transitions of raw context data. Also, we propose a sequential model by extending B-HMM with the prior knowledge of contextual features to model context more accurately. Then, to efficiently learn the parameters and initial values of the proposed models, we develop a novel approach for parameter estimation by integrating the Dirichlet Process Mixture (DPM) model and the Mixture Unigram (MU) model. Furthermore, by incorporating both user-labeled and unlabeled data, we propose a semisupervised learning-based algorithm to identify and model the latent semantics of context. Finally, experimental results on real-world data clearly validate both the efficiency and effectiveness of the proposed approaches for recognizing personalized context of mobile users.", acknowledgement = ack-nhfb, articleno = "10", fjournal = "ACM Transactions on Knowledge Discovery from Data (TKDD)", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J1054", } @Article{Liu:2014:ADI, author = "Siyuan Liu and Lei Chen and Lionel M. 
Ni", title = "Anomaly Detection from Incomplete Data", journal = j-TKDD, volume = "9", number = "2", pages = "11:1--11:??", month = sep, year = "2014", CODEN = "????", DOI = "http://dx.doi.org/10.1145/2629668", ISSN = "1556-4681 (print), 1556-472X (electronic)", ISSN-L = "1556-4681", bibdate = "Tue Oct 7 18:49:26 MDT 2014", bibsource = "http://www.acm.org/pubs/contents/journals/tkdd/; http://www.math.utah.edu/pub/tex/bib/tkdd.bib", abstract = "Anomaly detection (a.k.a., outlier or burst detection) is a well-motivated problem and a major data mining and knowledge discovery task. In this article, we study the problem of population anomaly detection, one of the key issues related to event monitoring and population management within a city. Through studying detected population anomalies, we can trace and analyze these anomalies, which could help to model city traffic design and event impact analysis and prediction. Although a significant and interesting issue, it is very hard to detect population anomalies and retrieve anomaly trajectories, especially given that it is difficult to get actual and sufficient population data. To address the difficulties of a lack of real population data, we take advantage of mobile phone networks, which offer enormous spatial and temporal communication data on persons. More importantly, we claim that we can utilize these mobile phone data to infer and approximate population data. Thus, we can study the population anomaly detection problem by taking advantages of unique features hidden in mobile phone data. In this article, we present a system to conduct Population Anomaly Detection (PAD). First, we propose an effective clustering method, correlation-based clustering, to cluster the incomplete location information from mobile phone data (i.e., from mobile call volume distribution to population density distribution). Then, we design an adaptive parameter-free detection method, R-scan, to capture the distributed dynamic anomalies. 
Finally, we devise an efficient algorithm, BT-miner, to retrieve anomaly trajectories. The experimental results from real-life mobile phone data confirm the effectiveness and efficiency of the proposed algorithms. Finally, the proposed methods are realized as a pilot system in a city in China.", acknowledgement = ack-nhfb, articleno = "11", fjournal = "ACM Transactions on Knowledge Discovery from Data (TKDD)", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J1054", } @Article{Gundecha:2014:UVR, author = "Pritam Gundecha and Geoffrey Barbier and Jiliang Tang and Huan Liu", title = "User Vulnerability and Its Reduction on a Social Networking Site", journal = j-TKDD, volume = "9", number = "2", pages = "12:1--12:??", month = sep, year = "2014", CODEN = "????", DOI = "http://dx.doi.org/10.1145/2630421", ISSN = "1556-4681 (print), 1556-472X (electronic)", ISSN-L = "1556-4681", bibdate = "Tue Oct 7 18:49:26 MDT 2014", bibsource = "http://www.acm.org/pubs/contents/journals/tkdd/; http://www.math.utah.edu/pub/tex/bib/tkdd.bib", abstract = "Privacy and security are major concerns for many users of social media. When users share information (e.g., data and photos) with friends, they can make their friends vulnerable to security and privacy breaches with dire consequences. With the continuous expansion of a user's social network, privacy settings alone are often inadequate to protect a user's profile. In this research, we aim to address some critical issues related to privacy protection: (1) How can we measure and assess individual users' vulnerability? (2) With the diversity of one's social network friends, how can one figure out an effective approach to maintaining balance between vulnerability and social utility? In this work, first we present a novel way to define vulnerable friends from an individual user's perspective. 
User vulnerability is dependent on whether or not the user's friends' privacy settings protect the friend and the individual's network of friends (which includes the user). We show that it is feasible to measure and assess user vulnerability and reduce one's vulnerability without changing the structure of a social networking site. The approach is to unfriend one's most vulnerable friends. However, when such a vulnerable friend is also socially important, unfriending him or her would significantly reduce one's own social status. We formulate this novel problem as vulnerability minimization with social utility constraints. We formally define the optimization problem and provide an approximation algorithm with a proven bound. Finally, we conduct a large-scale evaluation of a new framework using a Facebook dataset. We resort to experiments and observe how much vulnerability an individual user can be decreased by unfriending a vulnerable friend. We compare performance of different unfriending strategies and discuss the security risk of new friend requests. Additionally, by employing different forms of social utility, we confirm that the balance between user vulnerability and social utility can be practically achieved.", acknowledgement = ack-nhfb, articleno = "12", fjournal = "ACM Transactions on Knowledge Discovery from Data (TKDD)", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J1054", } @Article{Duan:2014:SRC, author = "Lian Duan and W. 
Nick Street and Yanchi Liu and Songhua Xu and Brook Wu", title = "Selecting the Right Correlation Measure for Binary Data", journal = j-TKDD, volume = "9", number = "2", pages = "13:1--13:??", month = sep, year = "2014", CODEN = "????", DOI = "http://dx.doi.org/10.1145/2637484", ISSN = "1556-4681 (print), 1556-472X (electronic)", ISSN-L = "1556-4681", bibdate = "Tue Oct 7 18:49:26 MDT 2014", bibsource = "http://www.acm.org/pubs/contents/journals/tkdd/; http://www.math.utah.edu/pub/tex/bib/tkdd.bib", abstract = "Finding the most interesting correlations among items is essential for problems in many commercial, medical, and scientific domains. Although there are numerous measures available for evaluating correlations, different correlation measures provide drastically different results. Piatetsky-Shapiro provided three mandatory properties for any reasonable correlation measure, and Tan et al. proposed several properties to categorize correlation measures; however, it is still hard for users to choose the desirable correlation measures according to their needs. In order to solve this problem, we explore the effectiveness problem in three ways. First, we propose two desirable properties and two optional properties for correlation measure selection and study the property satisfaction for different correlation measures. Second, we study different techniques to adjust correlation measures and propose two new correlation measures: the Simplified $ \chi^2 $ with Continuity Correction and the Simplified $ \chi^2 $ with Support. Third, we analyze the upper and lower bounds of different measures and categorize them by the bound differences. 
Combining these three directions, we provide guidelines for users to choose the proper measure according to their needs.", acknowledgement = ack-nhfb, articleno = "13", fjournal = "ACM Transactions on Knowledge Discovery from Data (TKDD)", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J1054", } @Article{Huang:2014:PBA, author = "Hao Huang and Hong Qin and Shinjae Yoo and Dantong Yu", title = "Physics-Based Anomaly Detection Defined on Manifold Space", journal = j-TKDD, volume = "9", number = "2", pages = "14:1--14:??", month = sep, year = "2014", CODEN = "????", DOI = "http://dx.doi.org/10.1145/2641574", ISSN = "1556-4681 (print), 1556-472X (electronic)", ISSN-L = "1556-4681", bibdate = "Tue Oct 7 18:49:26 MDT 2014", bibsource = "http://www.acm.org/pubs/contents/journals/tkdd/; http://www.math.utah.edu/pub/tex/bib/tkdd.bib", abstract = "Current popular anomaly detection algorithms are capable of detecting global anomalies but often fail to distinguish local anomalies from normal instances. Inspired by contemporary physics theory (i.e., heat diffusion and quantum mechanics), we propose two unsupervised anomaly detection algorithms. Building on the embedding manifold derived from heat diffusion, we devise Local Anomaly Descriptor (LAD), which faithfully reveals the intrinsic neighborhood density. It uses a scale-dependent umbrella operator to bridge global and local properties, which makes LAD more informative within an adaptive scope of neighborhood. To offer more stability of local density measurement on scaling parameter tuning, we formulate Fermi Density Descriptor (FDD), which measures the probability of a fermion particle being at a specific location. By choosing the stable energy distribution function, FDD steadily distinguishes anomalies from normal instances with any scaling parameter setting. 
To further enhance the efficacy of our proposed algorithms, we explore the utility of anisotropic Gaussian kernel (AGK), which offers better manifold-aware affinity information. We also quantify and examine the effect of different Laplacian normalizations for anomaly detection. Comprehensive experiments on both synthetic and benchmark datasets verify that our proposed algorithms outperform the existing anomaly detection algorithms.", acknowledgement = ack-nhfb, articleno = "14", fjournal = "ACM Transactions on Knowledge Discovery from Data (TKDD)", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J1054", } @Article{Gionis:2015:ISI, author = "Aristides Gionis and Hang Li", title = "Introduction to the Special Issue {{ACM} {SIGKDD}} 2013", journal = j-TKDD, volume = "9", number = "3", pages = "15:1--15:??", month = apr, year = "2015", CODEN = "????", DOI = "http://dx.doi.org/10.1145/2700993", ISSN = "1556-4681 (print), 1556-472X (electronic)", ISSN-L = "1556-4681", bibdate = "Tue Apr 14 09:22:28 MDT 2015", bibsource = "http://www.acm.org/pubs/contents/journals/tkdd/; http://www.math.utah.edu/pub/tex/bib/tkdd.bib", acknowledgement = ack-nhfb, articleno = "15e", fjournal = "ACM Transactions on Knowledge Discovery from Data (TKDD)", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J1054", } @Article{Jha:2015:SES, author = "Madhav Jha and C. 
Seshadhri and Ali Pinar", title = "A Space-Efficient Streaming Algorithm for Estimating Transitivity and Triangle Counts Using the Birthday Paradox", journal = j-TKDD, volume = "9", number = "3", pages = "15:1--15:??", month = feb, year = "2015", CODEN = "????", DOI = "http://dx.doi.org/10.1145/2700395", ISSN = "1556-4681 (print), 1556-472X (electronic)", ISSN-L = "1556-4681", bibdate = "Fri Mar 6 09:34:37 MST 2015", bibsource = "http://www.acm.org/pubs/contents/journals/tkdd/; http://www.math.utah.edu/pub/tex/bib/tkdd.bib", abstract = "We design a space-efficient algorithm that approximates the transitivity (global clustering coefficient) and total triangle count with only a single pass through a graph given as a stream of edges. Our procedure is based on the classic probabilistic result, the birthday paradox. When the transitivity is constant and there are more edges than wedges (common properties for social networks), we can prove that our algorithm requires $O( \sqrt n )$ space ($n$ is the number of vertices) to provide accurate estimates. We run a detailed set of experiments on a variety of real graphs and demonstrate that the memory requirement of the algorithm is a tiny fraction of the graph. For example, even for a graph with 200 million edges, our algorithm stores just 40,000 edges to give accurate results. 
Being a single pass streaming algorithm, our procedure also maintains a real-time estimate of the transitivity/number of triangles of a graph by storing a minuscule fraction of edges.", acknowledgement = ack-nhfb, articleno = "15", fjournal = "ACM Transactions on Knowledge Discovery from Data (TKDD)", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J1054", } @Article{Tang:2015:FMT, author = "Lu-An Tang and Xiao Yu and Quanquan Gu and Jiawei Han and Guofei Jiang and Alice Leung and Thomas {La Porta}", title = "A Framework of Mining Trajectories from Untrustworthy Data in Cyber-Physical System", journal = j-TKDD, volume = "9", number = "3", pages = "16:1--16:??", month = feb, year = "2015", CODEN = "????", DOI = "http://dx.doi.org/10.1145/2700394", ISSN = "1556-4681 (print), 1556-472X (electronic)", ISSN-L = "1556-4681", bibdate = "Fri Mar 6 09:34:37 MST 2015", bibsource = "http://www.acm.org/pubs/contents/journals/tkdd/; http://www.math.utah.edu/pub/tex/bib/tkdd.bib", abstract = "A cyber-physical system (CPS) integrates physical (i.e., sensor) devices with cyber (i.e., informational) components to form a context-sensitive system that responds intelligently to dynamic changes in real-world situations. The CPS has wide applications in scenarios such as environment monitoring, battlefield surveillance, and traffic control. One key research problem of CPS is called mining lines in the sand. With a large number of sensors (sand) deployed in a designated area, the CPS is required to discover all trajectories (lines) of passing intruders in real time. There are two crucial challenges that need to be addressed: (1) the collected sensor data are not trustworthy, and (2) the intruders do not send out any identification information. The system needs to distinguish multiple intruders and track their movements. This study proposes a method called LiSM (Line-in-the-Sand Miner) to discover trajectories from untrustworthy sensor data. 
LiSM constructs a watching network from sensor data and computes the locations of intruder appearances based on the link information of the network. The system retrieves a cone model from the historical trajectories to track multiple intruders. Finally, the system validates the mining results and updates sensors' reliability scores in a feedback process. In addition, LoRM (Line-on-the-Road Miner) is proposed for trajectory discovery on road networks --- mining lines on the roads. LoRM employs a filtering-and-refinement framework to reduce the distance computational overhead on road networks and uses a shortest-path-measure to track intruders. The proposed methods are evaluated with extensive experiments on big datasets. The experimental results show that the proposed methods achieve higher accuracy and efficiency in trajectory mining tasks.", acknowledgement = ack-nhfb, articleno = "16", fjournal = "ACM Transactions on Knowledge Discovery from Data (TKDD)", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J1054", } @Article{Wang:2015:QDR, author = "Zheng Wang and Jieping Ye", title = "Querying Discriminative and Representative Samples for Batch Mode Active Learning", journal = j-TKDD, volume = "9", number = "3", pages = "17:1--17:??", month = feb, year = "2015", CODEN = "????", DOI = "http://dx.doi.org/10.1145/2700408", ISSN = "1556-4681 (print), 1556-472X (electronic)", ISSN-L = "1556-4681", bibdate = "Fri Mar 6 09:34:37 MST 2015", bibsource = "http://www.acm.org/pubs/contents/journals/tkdd/; http://www.math.utah.edu/pub/tex/bib/tkdd.bib", abstract = "Empirical risk minimization (ERM) provides a principled guideline for many machine learning and data mining algorithms. Under the ERM principle, one minimizes an upper bound of the true risk, which is approximated by the summation of empirical risk and the complexity of the candidate classifier class. To guarantee a satisfactory learning performance, ERM requires that the training data are i.i.d.
sampled from the unknown source distribution. However, this may not be the case in active learning, where one selects the most informative samples to label, and these data may not follow the source distribution. In this article, we generalize the ERM principle to the active learning setting. We derive a novel form of upper bound for the true risk in the active learning setting; by minimizing this upper bound, we develop a practical batch mode active learning method. The proposed formulation involves a nonconvex integer programming optimization problem. We solve it efficiently by an alternating optimization method. Our method is shown to query the most informative samples while preserving the source distribution as much as possible, thus identifying the most uncertain and representative queries. We further extend our method to multiclass active learning by introducing novel pseudolabels in the multiclass case and developing an efficient algorithm. Experiments on benchmark datasets and real-world applications demonstrate the superior performance of our proposed method compared to state-of-the-art methods.", acknowledgement = ack-nhfb, articleno = "17", fjournal = "ACM Transactions on Knowledge Discovery from Data (TKDD)", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J1054", } @Article{Gopal:2015:HBI, author = "Siddharth Gopal and Yiming Yang", title = "Hierarchical {Bayesian} Inference and Recursive Regularization for Large-Scale Classification", journal = j-TKDD, volume = "9", number = "3", pages = "18:1--18:??", month = apr, year = "2015", CODEN = "????", DOI = "http://dx.doi.org/10.1145/2629585", ISSN = "1556-4681 (print), 1556-472X (electronic)", ISSN-L = "1556-4681", bibdate = "Tue Apr 14 09:22:28 MDT 2015", bibsource = "http://www.acm.org/pubs/contents/journals/tkdd/; http://www.math.utah.edu/pub/tex/bib/tkdd.bib", abstract = "In this article, we address open challenges in large-scale classification, focusing on how to effectively leverage the 
dependency structures (hierarchical or graphical) among class labels, and how to make the inference scalable in jointly optimizing all model parameters. We propose two main approaches, namely the hierarchical Bayesian inference framework and the recursive regularization scheme. The key idea in both approaches is to reinforce the similarity among parameters across the nodes in a hierarchy or network based on the proximity and connectivity of the nodes. For scalability, we develop hierarchical variational inference algorithms and fast dual coordinate descent training procedures with parallelization. In our experiments for classification problems with hundreds of thousands of classes and millions of training instances with terabytes of parameters, the proposed methods show consistent and statistically significant improvements over other competing approaches, and the best results on multiple benchmark datasets for large-scale classification.", acknowledgement = ack-nhfb, articleno = "18", fjournal = "ACM Transactions on Knowledge Discovery from Data (TKDD)", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J1054", } @Article{Yin:2015:MLB, author = "Hongzhi Yin and Bin Cui and Ling Chen and Zhiting Hu and Chengqi Zhang", title = "Modeling Location-Based User Rating Profiles for Personalized Recommendation", journal = j-TKDD, volume = "9", number = "3", pages = "19:1--19:??", month = apr, year = "2015", CODEN = "????", DOI = "http://dx.doi.org/10.1145/2663356", ISSN = "1556-4681 (print), 1556-472X (electronic)", ISSN-L = "1556-4681", bibdate = "Tue Apr 14 09:22:28 MDT 2015", bibsource = "http://www.acm.org/pubs/contents/journals/tkdd/; http://www.math.utah.edu/pub/tex/bib/tkdd.bib", abstract = "This article proposes LA-LDA, a location-aware probabilistic generative model that exploits location-based ratings to model user profiles and produce recommendations.
Most of the existing recommendation models do not consider the spatial information of users or items; however, LA-LDA supports three classes of location-based ratings, namely spatial user ratings for nonspatial items, nonspatial user ratings for spatial items, and spatial user ratings for spatial items. LA-LDA consists of two components, ULA-LDA and ILA-LDA, which are designed to take into account user and item location information, respectively. The component ULA-LDA explicitly incorporates and quantifies the influence from local public preferences to produce recommendations by considering user home locations, whereas the component ILA-LDA recommends items that are closer in both taste and travel distance to the querying users by capturing item co-occurrence patterns, as well as item location co-occurrence patterns. The two components of LA-LDA can be applied either separately or collectively, depending on the available types of location-based ratings. To demonstrate the applicability and flexibility of the LA-LDA model, we deploy it to both top-$k$ recommendation and cold start recommendation scenarios.
Experimental evidence on large-scale real-world data, including the data from Gowalla (a location-based social network), DoubanEvent (an event-based social network), and MovieLens (a movie recommendation system), reveals that LA-LDA models user profiles more accurately by outperforming existing recommendation models for top-$k$ recommendation and the cold start problem.", acknowledgement = ack-nhfb, articleno = "19", fjournal = "ACM Transactions on Knowledge Discovery from Data (TKDD)", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J1054", } @Article{Hu:2015:PSD, author = "Juhua Hu and De-Chuan Zhan and Xintao Wu and Yuan Jiang and Zhi-Hua Zhou", title = "Pairwised Specific Distance Learning from Physical Linkages", journal = j-TKDD, volume = "9", number = "3", pages = "20:1--20:??", month = apr, year = "2015", CODEN = "????", DOI = "http://dx.doi.org/10.1145/2700405", ISSN = "1556-4681 (print), 1556-472X (electronic)", ISSN-L = "1556-4681", bibdate = "Tue Apr 14 09:22:28 MDT 2015", bibsource = "http://www.acm.org/pubs/contents/journals/tkdd/; http://www.math.utah.edu/pub/tex/bib/tkdd.bib", abstract = "In real tasks, usually a good classification performance can only be obtained when a good distance metric is obtained; therefore, distance metric learning has attracted significant attention in the past few years. Typical studies of distance metric learning evaluate how to construct an appropriate distance metric that is able to separate training data points from different classes or satisfy a set of constraints (e.g., must-links and/or cannot-links). It is noteworthy that this task becomes challenging when there are only limited labeled training data points and no constraints are given explicitly. Moreover, most existing approaches aim to construct a global distance metric that is applicable to all data points. However, different data points may have different properties and may require different distance metrics.
We notice that data points in real tasks are often connected by physical links (e.g., people are linked with each other in social networks; personal webpages are often connected to other webpages, including nonpersonal webpages), but the linkage information has not been exploited in distance metric learning. In this article, we develop a pairwised specific distance (PSD) approach that exploits the structures of physical linkages and in particular captures the key observations that nonmetric and clique linkages imply the appearance of different or unique semantics, respectively. It is noteworthy that, rather than generating a global distance, PSD generates different distances for different pairs of data points; this property is desired in applications involving complicated data semantics. We mainly present PSD for multi-class learning and further extend it to multi-label learning. Experimental results validate the effectiveness of PSD, especially in the scenarios in which there are very limited labeled training data points and no explicit constraints are given.", acknowledgement = ack-nhfb, articleno = "20", fjournal = "ACM Transactions on Knowledge Discovery from Data (TKDD)", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J1054", } @Article{Soundarajan:2015:ULG, author = "Sucheta Soundarajan and John E. Hopcroft", title = "Use of Local Group Information to Identify Communities in Networks", journal = j-TKDD, volume = "9", number = "3", pages = "21:1--21:??", month = apr, year = "2015", CODEN = "????", DOI = "http://dx.doi.org/10.1145/2700404", ISSN = "1556-4681 (print), 1556-472X (electronic)", ISSN-L = "1556-4681", bibdate = "Tue Apr 14 09:22:28 MDT 2015", bibsource = "http://www.acm.org/pubs/contents/journals/tkdd/; http://www.math.utah.edu/pub/tex/bib/tkdd.bib", abstract = "The recent interest in networks has inspired a broad range of work on algorithms and techniques to characterize, identify, and extract communities from networks. 
Such efforts are complicated by a lack of consensus on what a ``community'' truly is, and these disagreements have led to a wide variety of mathematical formulations for describing communities. Often, these mathematical formulations, such as modularity and conductance, have been founded in the general principle that communities, like a $G(n, p)$ graph, are ``round,'' with connections throughout the entire community, and so algorithms were developed to optimize such mathematical measures. More recently, a variety of algorithms have been developed that, rather than expecting connectivity through the entire community, seek out very small groups of well-connected nodes and then connect these groups into larger communities. In this article, we examine seven real networks, each containing external annotation that allows us to identify ``annotated communities.'' A study of these annotated communities gives insight into why the second category of community detection algorithms may be more successful than the first category. We then present a flexible algorithm template that is based on the idea of joining together small sets of nodes. In this template, we first identify very small, tightly connected ``subcommunities'' of nodes, each corresponding to a single node's ``perception'' of the network around it. We then create a new network in which each node represents such a subcommunity, and then identify communities in this new network. Because each node can appear in multiple subcommunities, this method allows us to detect overlapping communities. When evaluated on real data, we show that our template outperforms many other state-of-the-art algorithms.", acknowledgement = ack-nhfb, articleno = "21", fjournal = "ACM Transactions on Knowledge Discovery from Data (TKDD)", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J1054", } @Article{Wang:2015:UCN, author = "Pinghui Wang and Junzhou Zhao and John C. S.
Lui and Don Towsley and Xiaohong Guan", title = "Unbiased Characterization of Node Pairs over Large Graphs", journal = j-TKDD, volume = "9", number = "3", pages = "22:1--22:??", month = apr, year = "2015", CODEN = "????", DOI = "http://dx.doi.org/10.1145/2700393", ISSN = "1556-4681 (print), 1556-472X (electronic)", ISSN-L = "1556-4681", bibdate = "Tue Apr 14 09:22:28 MDT 2015", bibsource = "http://www.acm.org/pubs/contents/journals/tkdd/; http://www.math.utah.edu/pub/tex/bib/tkdd.bib", abstract = "Characterizing user pair relationships is important for applications such as friend recommendation and interest targeting in online social networks (OSNs). Due to the large-scale nature of such networks, it is infeasible to enumerate all user pairs and thus sampling is used. In this article, we show that it is a great challenge for OSN service providers to characterize user pair relationships, even when they possess the complete graph topology. The reason is that when sampling techniques (i.e., uniform vertex sampling (UVS) and random walk (RW)) are naively applied, they can introduce large biases, particularly for estimating similarity distribution of user pairs with constraints like existence of mutual neighbors, which is important for applications such as identifying network homophily. Estimating statistics of user pairs is more challenging in the absence of the complete topology information, as an unbiased sampling technique like UVS is usually not allowed and exploring the OSN graph topology is expensive. To address these challenges, we present unbiased sampling methods to characterize user pair properties based on UVS and RW techniques. We carry out an evaluation of our methods to show their accuracy and efficiency. 
Finally, we apply our methods to three OSNs --- Foursquare, Douban, and Xiami --- and discover that significant homophily is present in these networks.", acknowledgement = ack-nhfb, articleno = "22", fjournal = "ACM Transactions on Knowledge Discovery from Data (TKDD)", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J1054", } @Article{Vlachos:2015:DPC, author = "Michail Vlachos and Johannes Schneider and Vassilios G. Vassiliadis", title = "On Data Publishing with Clustering Preservation", journal = j-TKDD, volume = "9", number = "3", pages = "23:1--23:??", month = apr, year = "2015", CODEN = "????", DOI = "http://dx.doi.org/10.1145/2700403", ISSN = "1556-4681 (print), 1556-472X (electronic)", ISSN-L = "1556-4681", bibdate = "Tue Apr 14 09:22:28 MDT 2015", bibsource = "http://www.acm.org/pubs/contents/journals/tkdd/; http://www.math.utah.edu/pub/tex/bib/tkdd.bib", abstract = "The emergence of cloud-based storage services is opening up new avenues in data exchange and data dissemination. This has amplified the interest in right-protection mechanisms to establish ownership in the event of data leakage. Current right-protection technologies, however, rarely provide strong guarantees on dataset utility after the protection process. This work presents techniques that explicitly address this topic and provably preserve the outcome of certain mining operations. In particular, we take special care to guarantee that the outcome of hierarchical clustering operations remains the same before and after right protection. Our approach considers all prevalent hierarchical clustering variants: single-, complete-, and average-linkage. We imprint the ownership in a dataset using watermarking principles, and we derive tight bounds on the expansion/contraction of distances incurred by the process. We leverage our analysis to design fast algorithms for right protection without exhaustively searching the vast design space.
Finally, because the right-protection process introduces a user-tunable distortion on the dataset, we explore the possibility of using this mechanism for data obfuscation. We quantify the tradeoff between obfuscation and utility for spatiotemporal datasets and discover very favorable characteristics of the process. An additional advantage is that when one is interested in both right-protecting and obfuscating the original data values, the proposed mechanism can accomplish both tasks simultaneously.", acknowledgement = ack-nhfb, articleno = "23", fjournal = "ACM Transactions on Knowledge Discovery from Data (TKDD)", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J1054", } @Article{VazDeMelo:2015:UDP, author = "Pedro O. S. {Vaz De Melo} and Christos Faloutsos and Renato Assun{\c{c}}{\~a}o and Rodrigo Alves and Antonio A. F. Loureiro", title = "Universal and Distinct Properties of Communication Dynamics: How to Generate Realistic Inter-event Times", journal = j-TKDD, volume = "9", number = "3", pages = "24:1--24:??", month = apr, year = "2015", CODEN = "????", DOI = "http://dx.doi.org/10.1145/2700399", ISSN = "1556-4681 (print), 1556-472X (electronic)", ISSN-L = "1556-4681", bibdate = "Tue Apr 14 09:22:28 MDT 2015", bibsource = "http://www.acm.org/pubs/contents/journals/tkdd/; http://www.math.utah.edu/pub/tex/bib/tkdd.bib", abstract = "With the advancement of information systems, means of communications are becoming cheaper, faster, and more available. Today, millions of people carrying smartphones or tablets are able to communicate practically any time and anywhere they want. They can access their e-mails, comment on weblogs, watch and post videos and photos (as well as comment on them), and make phone calls or text messages almost ubiquitously. Given this scenario, in this article, we tackle a fundamental aspect of this new era of communication: How the time intervals between communication events behave for different technologies and means of communications. 
Are there universal patterns for the Inter-Event Time Distribution (IED)? How do inter-event times behave differently among particular technologies? To answer these questions, we analyzed eight different datasets from real and modern communication data and found four well-defined patterns seen in all the eight datasets. Moreover, we propose the use of the Self-Feeding Process (SFP) to generate inter-event times between communications. The SFP is an extremely parsimonious point process that requires at most two parameters and is able to generate inter-event times with all the universal properties we observed in the data. We also show three potential applications of the SFP: as a framework to generate a synthetic dataset containing realistic communication events of any one of the analyzed means of communications, as a technique to detect anomalies, and as a building block for more specific models that aim to encompass the particularities seen in each of the analyzed systems.", acknowledgement = ack-nhfb, articleno = "24", fjournal = "ACM Transactions on Knowledge Discovery from Data (TKDD)", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J1054", } @Article{Zhang:2015:WIY, author = "Jing Zhang and Jie Tang and Juanzi Li and Yang Liu and Chunxiao Xing", title = "Who Influenced You? {Predicting} Retweet via Social Influence Locality", journal = j-TKDD, volume = "9", number = "3", pages = "25:1--25:??", month = apr, year = "2015", CODEN = "????", DOI = "http://dx.doi.org/10.1145/2700398", ISSN = "1556-4681 (print), 1556-472X (electronic)", ISSN-L = "1556-4681", bibdate = "Tue Apr 14 09:22:28 MDT 2015", bibsource = "http://www.acm.org/pubs/contents/journals/tkdd/; http://www.math.utah.edu/pub/tex/bib/tkdd.bib", abstract = "Social influence occurs when one's opinions, emotions, or behaviors are affected by others in a social network. However, social influence takes many forms, and its underlying mechanism is still unclear. 
For example, how is one's behavior influenced by a group of friends who know each other and by the friends from different ego friend circles? In this article, we study the social influence problem in a large microblogging network. Particularly, we consider users' (re)tweet behaviors and focus on investigating how friends in one's ego network influence retweet behaviors. We propose a novel notion of social influence locality and develop two instantiation functions based on pairwise influence and structural diversity. The defined influence locality functions have strong predictive power. Without any additional features, we can obtain an F1-score of 71.65\% for predicting users' retweet behaviors by training a logistic regression classifier based on the defined influence locality functions. We incorporate social influence locality into a factor graph model, which can further leverage the network-based correlation. Our experiments on the large microblogging network show that the model significantly improves the precision of retweet prediction. Our analysis also reveals several intriguing discoveries. For example, if you have six friends retweeting a microblog, the average likelihood that you will also retweet it strongly depends on the structure among the six friends: The likelihood will significantly drop (only 1/6 ) when the six friends do not know each other, compared with the case when the six friends know each other.", acknowledgement = ack-nhfb, articleno = "25", fjournal = "ACM Transactions on Knowledge Discovery from Data (TKDD)", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J1054", } @Article{Xie:2015:MMA, author = "Hong Xie and John C. S. 
Lui", title = "Mathematical Modeling and Analysis of Product Rating with Partial Information", journal = j-TKDD, volume = "9", number = "4", pages = "26:1--26:??", month = jun, year = "2015", CODEN = "????", DOI = "http://dx.doi.org/10.1145/2700386", ISSN = "1556-4681 (print), 1556-472X (electronic)", ISSN-L = "1556-4681", bibdate = "Wed Jun 3 06:21:22 MDT 2015", bibsource = "http://www.acm.org/pubs/contents/journals/tkdd/; http://www.math.utah.edu/pub/tex/bib/tkdd.bib", abstract = "Many Web services like Amazon, Epinions, and TripAdvisor provide historical product ratings so that users can evaluate the quality of products. Product ratings are important because they affect how well a product will be adopted by the market. The challenge is that we only have partial information on these ratings: each user assigns ratings to only a small subset of products. Under this partial information setting, we explore a number of fundamental questions. What is the minimum number of ratings a product needs so that one can make a reliable evaluation of its quality? How may users' misbehavior, such as cheating in product rating, affect the evaluation result? To answer these questions, we present a probabilistic model to capture various important factors (e.g., rating aggregation rules, rating behavior) that may influence the product quality assessment under the partial information setting. We derive the minimum number of ratings needed to produce a reliable indicator on the quality of a product. We extend our model to accommodate users' misbehavior in product rating. We derive the maximum fraction of misbehaving users that a rating aggregation rule can tolerate and the minimum number of ratings needed to compensate. We carry out experiments using both synthetic and real-world data (from Amazon and TripAdvisor). 
We not only validate our model but also show that the ``average rating rule'' produces more reliable and robust product quality assessments than the ``majority rating rule'' and the ``median rating rule'' in aggregating product ratings. Last, we perform experiments on two movie rating datasets (from Flixster and Netflix) to demonstrate how to apply our framework to improve the applications of recommender systems.", acknowledgement = ack-nhfb, articleno = "26", fjournal = "ACM Transactions on Knowledge Discovery from Data (TKDD)", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J1054", } @Article{Esuli:2015:OTQ, author = "Andrea Esuli and Fabrizio Sebastiani", title = "Optimizing Text Quantifiers for Multivariate Loss Functions", journal = j-TKDD, volume = "9", number = "4", pages = "27:1--27:??", month = jun, year = "2015", CODEN = "????", DOI = "http://dx.doi.org/10.1145/2700406", ISSN = "1556-4681 (print), 1556-472X (electronic)", ISSN-L = "1556-4681", bibdate = "Wed Jun 3 06:21:22 MDT 2015", bibsource = "http://www.acm.org/pubs/contents/journals/tkdd/; http://www.math.utah.edu/pub/tex/bib/tkdd.bib", abstract = "We address the problem of quantification, a supervised learning task whose goal is, given a class, to estimate the relative frequency (or prevalence) of the class in a dataset of unlabeled items. Quantification has several applications in data and text mining, such as estimating the prevalence of positive reviews in a set of reviews of a given product or estimating the prevalence of a given support issue in a dataset of transcripts of phone calls to tech support. So far, quantification has been addressed by learning a general-purpose classifier, counting the unlabeled items that have been assigned the class, and tuning the obtained counts according to some heuristics.
In this article, we depart from the tradition of using general-purpose classifiers and use instead a supervised learning model for structured prediction, capable of generating classifiers directly optimized for the (multivariate and nonlinear) function used for evaluating quantification accuracy. The experiments that we have run on 5,500 binary high-dimensional datasets (averaging more than 14,000 documents each) show that this method is more accurate, more stable, and more efficient than existing state-of-the-art quantification methods.", acknowledgement = ack-nhfb, articleno = "27", fjournal = "ACM Transactions on Knowledge Discovery from Data (TKDD)", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J1054", } @Article{Lin:2015:IMS, author = "Bing-Rong Lin and Daniel Kifer", title = "Information Measures in Statistical Privacy and Data Processing Applications", journal = j-TKDD, volume = "9", number = "4", pages = "28:1--28:??", month = jun, year = "2015", CODEN = "????", DOI = "http://dx.doi.org/10.1145/2700407", ISSN = "1556-4681 (print), 1556-472X (electronic)", ISSN-L = "1556-4681", bibdate = "Wed Jun 3 06:21:22 MDT 2015", bibsource = "http://www.acm.org/pubs/contents/journals/tkdd/; http://www.math.utah.edu/pub/tex/bib/tkdd.bib", abstract = "In statistical privacy, utility refers to two concepts: information preservation, how much statistical information is retained by a sanitizing algorithm, and usability, how (and with how much difficulty) one extracts this information to build statistical models, answer queries, and so forth. Some scenarios incentivize a separation between information preservation and usability, so that the data owner first chooses a sanitizing algorithm to maximize a measure of information preservation, and, afterward, the data consumers process the sanitized output according to their various individual needs [Ghosh et al. 2009; Williams and McSherry 2010]. 
We analyze the information-preserving properties of utility measures with a combination of two new and three existing utility axioms and study how violations of an axiom can be fixed. We show that the average (over possible outputs of the sanitizer) error of Bayesian decision makers forms the unique class of utility measures that satisfy all of the axioms. The axioms are agnostic to Bayesian concepts such as subjective probabilities and hence strengthen support for Bayesian views in privacy research. In particular, this result connects information preservation to aspects of usability---if the information preservation of a sanitizing algorithm should be measured as the average error of a Bayesian decision maker, shouldn't Bayesian decision theory be a good choice when it comes to using the sanitized outputs for various purposes? We put this idea to the test in the unattributed histogram problem where our decision-theoretic postprocessing algorithm empirically outperforms previously proposed approaches.", acknowledgement = ack-nhfb, articleno = "28", fjournal = "ACM Transactions on Knowledge Discovery from Data (TKDD)", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J1054", } @Article{Huang:2015:DAC, author = "Hao Huang and Shinjae Yoo and Dantong Yu and Hong Qin", title = "Density-Aware Clustering Based on Aggregated Heat Kernel and Its Transformation", journal = j-TKDD, volume = "9", number = "4", pages = "29:1--29:??", month = jun, year = "2015", CODEN = "????", DOI = "http://dx.doi.org/10.1145/2700385", ISSN = "1556-4681 (print), 1556-472X (electronic)", ISSN-L = "1556-4681", bibdate = "Wed Jun 3 06:21:22 MDT 2015", bibsource = "http://www.acm.org/pubs/contents/journals/tkdd/; http://www.math.utah.edu/pub/tex/bib/tkdd.bib", abstract = "Current spectral clustering algorithms suffer from the sensitivity to existing noise and parameter scaling and may not be aware of different density distributions across clusters.
If these problems are left untreated, the consequent clustering results cannot accurately represent true data patterns, in particular, for complex real-world datasets with heterogeneous densities. This article aims to solve these problems by proposing a diffusion-based Aggregated Heat Kernel (AHK) to improve the clustering stability, and a Local Density Affinity Transformation (LDAT) to correct the bias originating from different cluster densities. AHK statistically models the heat diffusion traces along the entire time scale, so it ensures robustness during the clustering process, while LDAT probabilistically reveals the local density of each instance and suppresses the local density bias in the affinity matrix. Our proposed framework integrates these two techniques systematically. As a result, it not only provides an advanced noise-resisting and density-aware spectral mapping to the original dataset but also demonstrates the stability during the processing of tuning the scaling parameter (which usually controls the range of neighborhood). Furthermore, our framework works well with the majority of similarity kernels, which ensures its applicability to many types of data and problem domains. The systematic experiments on different applications show that our proposed algorithm outperforms state-of-the-art clustering algorithms for the data with heterogeneous density distributions and achieves robust clustering performance with respect to tuning the scaling parameter and handling various levels and types of noise.", acknowledgement = ack-nhfb, articleno = "29", fjournal = "ACM Transactions on Knowledge Discovery from Data (TKDD)", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J1054", } @Article{Yu:2015:CSF, author = "Kui Yu and Wei Ding and Dan A. 
Simovici and Hao Wang and Jian Pei and Xindong Wu", title = "Classification with Streaming Features: an Emerging-Pattern Mining Approach", journal = j-TKDD, volume = "9", number = "4", pages = "30:1--30:??", month = jun, year = "2015", CODEN = "????", DOI = "http://dx.doi.org/10.1145/2700409", ISSN = "1556-4681 (print), 1556-472X (electronic)", ISSN-L = "1556-4681", bibdate = "Wed Jun 3 06:21:22 MDT 2015", bibsource = "http://www.acm.org/pubs/contents/journals/tkdd/; http://www.math.utah.edu/pub/tex/bib/tkdd.bib", abstract = "Many datasets from real-world applications have very high-dimensional or increasing feature space. It is a new research problem to learn and maintain a classifier to deal with very high dimensionality or streaming features. In this article, we adapt the well-known emerging-pattern--based classification models and propose a semi-streaming approach. For streaming features, it is computationally expensive or even prohibitive to mine long-emerging patterns, and it is nontrivial to integrate emerging-pattern mining with feature selection. We present an online feature selection step, which is capable of selecting and maintaining a pool of effective features from a feature stream. Then, in our offline step, separated from the online step, we periodically compute and update emerging patterns from the pool of selected features from the online step. We evaluate the effectiveness and efficiency of the proposed method using a series of benchmark datasets and a real-world case study on Mars crater detection. Our proposed method yields classification performance comparable to the state-of-art static classification methods. 
Most important, the proposed method is significantly faster and can efficiently handle datasets with streaming features.", acknowledgement = ack-nhfb, articleno = "30", fjournal = "ACM Transactions on Knowledge Discovery from Data (TKDD)", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J1054", } @Article{Liu:2015:SEH, author = "Guimei Liu and Haojun Zhang and Mengling Feng and Limsoon Wong and See-Kiong Ng", title = "Supporting Exploratory Hypothesis Testing and Analysis", journal = j-TKDD, volume = "9", number = "4", pages = "31:1--31:??", month = jun, year = "2015", CODEN = "????", DOI = "http://dx.doi.org/10.1145/2701430", ISSN = "1556-4681 (print), 1556-472X (electronic)", ISSN-L = "1556-4681", bibdate = "Wed Jun 3 06:21:22 MDT 2015", bibsource = "http://www.acm.org/pubs/contents/journals/tkdd/; http://www.math.utah.edu/pub/tex/bib/tkdd.bib", abstract = "Conventional hypothesis testing is carried out in a hypothesis-driven manner. A scientist must first formulate a hypothesis based on what he or she sees and then devise a variety of experiments to test it. Given the rapid growth of data, it has become virtually impossible for a person to manually inspect all data to find all of the interesting hypotheses for testing. In this article, we propose and develop a data-driven framework for automatic hypothesis testing and analysis. We define a hypothesis as a comparison between two or more subpopulations. We find subpopulations for comparison using frequent pattern mining techniques and then pair them up for statistical hypothesis testing. We also generate additional information for further analysis of the hypotheses that are deemed significant. The number of hypotheses generated can be very large, and many of them are very similar. We develop algorithms to remove redundant hypotheses and present a succinct set of significant hypotheses to users. We conducted a set of experiments to show the efficiency and effectiveness of the proposed algorithms. 
The results show that our system can help users (1) identify significant hypotheses efficiently, (2) isolate the reasons behind significant hypotheses efficiently, and (3) find confounding factors that form Simpson's paradoxes with discovered significant hypotheses.", acknowledgement = ack-nhfb, articleno = "31", fjournal = "ACM Transactions on Knowledge Discovery from Data (TKDD)", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J1054", } @Article{Greco:2015:PDU, author = "Gianluigi Greco and Antonella Guzzo and Francesco Lupia and Luigi Pontieri", title = "Process Discovery under Precedence Constraints", journal = j-TKDD, volume = "9", number = "4", pages = "32:1--32:??", month = jun, year = "2015", CODEN = "????", DOI = "http://dx.doi.org/10.1145/2710020", ISSN = "1556-4681 (print), 1556-472X (electronic)", ISSN-L = "1556-4681", bibdate = "Wed Jun 3 06:21:22 MDT 2015", bibsource = "http://www.acm.org/pubs/contents/journals/tkdd/; http://www.math.utah.edu/pub/tex/bib/tkdd.bib", abstract = "Process discovery has emerged as a powerful approach to support the analysis and the design of complex processes. It consists of analyzing a set of traces registering the sequence of tasks performed along several enactments of a transactional system, in order to build a process model that can explain all the episodes recorded over them. An approach to accomplish this task is presented that can benefit from the background knowledge that, in many cases, is available to the analysts taking care of the process (re-)design. The approach is based on encoding the information gathered from the log and the (possibly) given background knowledge in terms of precedence constraints, that is, of constraints over the topology of the resulting process models. Mining algorithms are eventually formulated in terms of reasoning problems over precedence constraints, and the computational complexity of such problems is thoroughly analyzed by tracing their tractability frontier. 
Solution algorithms are proposed and their properties analyzed. These algorithms have been implemented in a prototype system, and results of a thorough experimental activity are discussed.", acknowledgement = ack-nhfb, articleno = "32", fjournal = "ACM Transactions on Knowledge Discovery from Data (TKDD)", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J1054", } @Article{Mirbakhsh:2015:ITR, author = "Nima Mirbakhsh and Charles X. Ling", title = "Improving Top-{$N$} Recommendation for Cold-Start Users via Cross-Domain Information", journal = j-TKDD, volume = "9", number = "4", pages = "33:1--33:??", month = jun, year = "2015", CODEN = "????", DOI = "http://dx.doi.org/10.1145/2724720", ISSN = "1556-4681 (print), 1556-472X (electronic)", ISSN-L = "1556-4681", bibdate = "Wed Jun 3 06:21:22 MDT 2015", bibsource = "http://www.acm.org/pubs/contents/journals/tkdd/; http://www.math.utah.edu/pub/tex/bib/tkdd.bib", abstract = "Making accurate recommendations for cold-start users is a challenging yet important problem in recommendation systems. Including more information from other domains is a natural solution to improve the recommendations. However, most previous work in cross-domain recommendations has focused on improving prediction accuracy with several severe limitations. In this article, we extend our previous work on clustering-based matrix factorization in single domains into cross domains. In addition, we utilize recent results on unobserved ratings. Our new method can more effectively utilize data from auxiliary domains to achieve better recommendations, especially for cold-start users. For example, our method improves the recall to 21\% on average for cold-start users, whereas previous methods result in only 15\% recall in the cross-domain Amazon dataset. We also observe almost the same improvements in the Epinions dataset. 
Considering that it is often difficult to make even a small improvement in recommendations, for cold-start users in particular, our result is quite significant.", acknowledgement = ack-nhfb, articleno = "33", fjournal = "ACM Transactions on Knowledge Discovery from Data (TKDD)", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J1054", } @Article{Bonchi:2015:CCC, author = "Francesco Bonchi and Aristides Gionis and Francesco Gullo and Charalampos E. Tsourakakis and Antti Ukkonen", title = "Chromatic Correlation Clustering", journal = j-TKDD, volume = "9", number = "4", pages = "34:1--34:??", month = jun, year = "2015", CODEN = "????", DOI = "http://dx.doi.org/10.1145/2728170", ISSN = "1556-4681 (print), 1556-472X (electronic)", ISSN-L = "1556-4681", bibdate = "Wed Jun 3 06:21:22 MDT 2015", bibsource = "http://www.acm.org/pubs/contents/journals/tkdd/; http://www.math.utah.edu/pub/tex/bib/tkdd.bib", abstract = "We study a novel clustering problem in which the pairwise relations between objects are categorical. This problem can be viewed as clustering the vertices of a graph whose edges are of different types (colors). We introduce an objective function that ensures the edges within each cluster have, as much as possible, the same color. We show that the problem is NP-hard and propose a randomized algorithm with approximation guarantee proportional to the maximum degree of the input graph. The algorithm iteratively picks a random edge as a pivot, builds a cluster around it, and removes the cluster from the graph. Although being fast, easy to implement, and parameter-free, this algorithm tends to produce a relatively large number of clusters. To overcome this issue we introduce a variant algorithm, which modifies how the pivot is chosen and how the cluster is built around the pivot.
Finally, to address the case where a fixed number of output clusters is required, we devise a third algorithm that directly optimizes the objective function based on the alternating-minimization paradigm. We also extend our objective function to handle cases where object's relations are described by multiple labels. We modify our randomized approximation algorithm to optimize such an extended objective function and show that its approximation guarantee remains proportional to the maximum degree of the graph. We test our algorithms on synthetic and real data from the domains of social media, protein-interaction networks, and bibliometrics. Results reveal that our algorithms outperform a baseline algorithm both in the task of reconstructing a ground-truth clustering and in terms of objective-function value.", acknowledgement = ack-nhfb, articleno = "34", fjournal = "ACM Transactions on Knowledge Discovery from Data (TKDD)", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J1054", } @Article{Wang:2015:LSC, author = "Hua Wang and Feiping Nie and Heng Huang", title = "Large-Scale Cross-Language {Web} Page Classification via Dual Knowledge Transfer Using Fast Nonnegative Matrix Trifactorization", journal = j-TKDD, volume = "10", number = "1", pages = "1:1--1:??", month = jul, year = "2015", CODEN = "????", DOI = "http://dx.doi.org/10.1145/2710021", ISSN = "1556-4681 (print), 1556-472X (electronic)", ISSN-L = "1556-4681", bibdate = "Tue Jul 28 17:19:31 MDT 2015", bibsource = "http://www.acm.org/pubs/contents/journals/tkdd/; http://www.math.utah.edu/pub/tex/bib/tkdd.bib", abstract = "With the rapid growth of modern technologies, Internet has reached almost every corner of the world. As a result, it becomes more and more important to manage and mine information contained in Web pages in different languages. Traditional supervised learning methods usually require a large amount of training data to obtain accurate and robust classification models. 
However, labeled Web pages did not increase as fast as the growth of Internet. The lack of sufficient training Web pages in many languages, especially for those in uncommonly used languages, makes it a challenge for traditional classification algorithms to achieve satisfactory performance. To address this, we observe that Web pages for a same topic from different languages usually share some common semantic patterns, though in different representation forms. In addition, we also observe that the associations between word clusters and Web page classes are another type of reliable carriers to transfer knowledge across languages. With these recognitions, in this article we propose a novel joint nonnegative matrix trifactorization (NMTF) based Dual Knowledge Transfer (DKT) approach for cross-language Web page classification. Our approach transfers knowledge from the auxiliary language, in which abundant labeled Web pages are available, to the target languages, in which we want to classify Web pages, through two different paths: word cluster approximation and the associations between word clusters and Web page classes. With the reinforcement between these two different knowledge transfer paths, our approach can achieve better classification accuracy. In order to deal with the large-scale real world data, we further develop the proposed DKT approach by constraining the factor matrices of NMTF to be cluster indicator matrices. Due to the nature of cluster indicator matrices, we can decouple the proposed optimization objective and the resulted subproblems are of much smaller sizes involving much less matrix multiplications, which make our new approach much more computationally efficient. We evaluate the proposed approach in extensive experiments using a real world cross-language Web page data set. 
Promising results have demonstrated the effectiveness of our approach that are consistent with our theoretical analyses.", acknowledgement = ack-nhfb, articleno = "1", fjournal = "ACM Transactions on Knowledge Discovery from Data (TKDD)", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J1054", } @Article{Zhou:2015:SIB, author = "Yang Zhou and Ling Liu", title = "Social Influence Based Clustering and Optimization over Heterogeneous Information Networks", journal = j-TKDD, volume = "10", number = "1", pages = "2:1--2:??", month = jul, year = "2015", CODEN = "????", DOI = "http://dx.doi.org/10.1145/2717314", ISSN = "1556-4681 (print), 1556-472X (electronic)", ISSN-L = "1556-4681", bibdate = "Tue Jul 28 17:19:31 MDT 2015", bibsource = "http://www.acm.org/pubs/contents/journals/tkdd/; http://www.math.utah.edu/pub/tex/bib/tkdd.bib", abstract = "Social influence analysis has shown great potential for strategic marketing decision. It is well known that people influence one another based on both their social connections and the social activities that they have engaged in the past. In this article, we develop an innovative and high-performance social influence based graph clustering framework with four unique features. First, we explicitly distinguish social connection based influence (self-influence) and social activity based influence (co-influence). We compute the self-influence similarity between two members based on their social connections within a single collaboration network, and compute the co-influence similarity by taking into account not only the set of activities that people participate but also the semantic association between these activities. Second, we define the concept of influence-based similarity by introducing a unified influence-based similarity matrix that employs an iterative weight update method to integrate self-influence and co-influence similarities. 
Third, we design a dynamic learning algorithm, called SI-Cluster, for social influence based graph clustering. It iteratively partitions a large social collaboration network into K clusters based on both the social network itself and the multiple associated activity information networks, each representing a category of activities that people have engaged. To make the SI-Cluster algorithm converge fast, we transform sophisticated nonlinear fractional programming problem with respect to multiple weights into a straightforward nonlinear parametric programming problem of single variable. Finally, we develop an optimization technique of diagonalizable-matrix approximation to speed up the computation of self-influence similarity and co-influence similarities. Our SI-Cluster-Opt significantly improves the efficiency of SI-Cluster on large graphs while maintaining high quality of clustering results. Extensive experimental evaluation on three real-world graphs shows that, compared to existing representative graph clustering algorithms, our SI-Cluster-Opt approach not only achieves a very good balance between self-influence and co-influence similarities but also scales extremely well for clustering large graphs in terms of time complexity while meeting the guarantee of high density, low entropy and low Davies--Bouldin Index.", acknowledgement = ack-nhfb, articleno = "2", fjournal = "ACM Transactions on Knowledge Discovery from Data (TKDD)", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J1054", } @Article{Papalexakis:2015:PSP, author = "Evangelos E. Papalexakis and Christos Faloutsos and Nicholas D. 
Sidiropoulos", title = "{ParCube}: Sparse Parallelizable {CANDECOMP--PARAFAC} Tensor Decomposition", journal = j-TKDD, volume = "10", number = "1", pages = "3:1--3:??", month = jul, year = "2015", CODEN = "????", DOI = "http://dx.doi.org/10.1145/2729980", ISSN = "1556-4681 (print), 1556-472X (electronic)", ISSN-L = "1556-4681", bibdate = "Tue Jul 28 17:19:31 MDT 2015", bibsource = "http://www.acm.org/pubs/contents/journals/tkdd/; http://www.math.utah.edu/pub/tex/bib/tkdd.bib", abstract = "How can we efficiently decompose a tensor into sparse factors, when the data do not fit in memory? Tensor decompositions have gained a steadily increasing popularity in data-mining applications; however, the current state-of-art decomposition algorithms operate on main memory and do not scale to truly large datasets. In this work, we propose ParCube, a new and highly parallelizable method for speeding up tensor decompositions that is well suited to produce sparse approximations. Experiments with even moderately large data indicate over 90\% sparser outputs and 14 times faster execution, with approximation error close to the current state of the art irrespective of computation and memory requirements. We provide theoretical guarantees for the algorithm's correctness and we experimentally validate our claims through extensive experiments, including four different real world datasets (Enron, Lbnl, Facebook and Nell), demonstrating its effectiveness for data-mining practitioners. In particular, we are the first to analyze the very large Nell dataset using a sparse tensor decomposition, demonstrating that ParCube enables us to handle effectively and efficiently very large datasets. 
Finally, we make our highly scalable parallel implementation publicly available, enabling reproducibility of our work.", acknowledgement = ack-nhfb, articleno = "3", fjournal = "ACM Transactions on Knowledge Discovery from Data (TKDD)", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J1054", } @Article{Ahmed:2015:AMC, author = "Rezwan Ahmed and George Karypis", title = "Algorithms for Mining the Coevolving Relational Motifs in Dynamic Networks", journal = j-TKDD, volume = "10", number = "1", pages = "4:1--4:??", month = jul, year = "2015", CODEN = "????", DOI = "http://dx.doi.org/10.1145/2733380", ISSN = "1556-4681 (print), 1556-472X (electronic)", ISSN-L = "1556-4681", bibdate = "Tue Jul 28 17:19:31 MDT 2015", bibsource = "http://www.acm.org/pubs/contents/journals/tkdd/; http://www.math.utah.edu/pub/tex/bib/tkdd.bib", abstract = "Computational methods and tools that can efficiently and effectively analyze the temporal changes in dynamic complex relational networks enable us to gain significant insights regarding the entity relations and their evolution. This article introduces a new class of dynamic graph patterns, referred to as coevolving relational motifs (CRMs), which are designed to identify recurring sets of entities whose relations change in a consistent way over time. CRMs can provide evidence to the existence of, possibly unknown, coordination mechanisms by identifying the relational motifs that evolve in a similar and highly conserved fashion. We developed an algorithm to efficiently analyze the frequent relational changes between the entities of the dynamic networks and capture all frequent coevolutions as CRMs. Our algorithm follows a depth-first exploration of the frequent CRM lattice and incorporates canonical labeling for redundancy elimination. Experimental results based on multiple real world dynamic networks show that the method is able to efficiently identify CRMs. 
In addition, a qualitative analysis of the results shows that the discovered patterns can be used as features to characterize the dynamic network.", acknowledgement = ack-nhfb, articleno = "4", fjournal = "ACM Transactions on Knowledge Discovery from Data (TKDD)", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J1054", } @Article{Campello:2015:HDE, author = "Ricardo J. G. B. Campello and Davoud Moulavi and Arthur Zimek and J{\"o}rg Sander", title = "Hierarchical Density Estimates for Data Clustering, Visualization, and Outlier Detection", journal = j-TKDD, volume = "10", number = "1", pages = "5:1--5:??", month = jul, year = "2015", CODEN = "????", DOI = "http://dx.doi.org/10.1145/2733381", ISSN = "1556-4681 (print), 1556-472X (electronic)", ISSN-L = "1556-4681", bibdate = "Tue Jul 28 17:19:31 MDT 2015", bibsource = "http://www.acm.org/pubs/contents/journals/tkdd/; http://www.math.utah.edu/pub/tex/bib/tkdd.bib", abstract = "An integrated framework for density-based cluster analysis, outlier detection, and data visualization is introduced in this article. The main module consists of an algorithm to compute hierarchical estimates of the level sets of a density, following Hartigan's classic model of density-contour clusters and trees. Such an algorithm generalizes and improves existing density-based clustering techniques with respect to different aspects. It provides as a result a complete clustering hierarchy composed of all possible density-based clusters following the nonparametric model adopted, for an infinite range of density thresholds. The resulting hierarchy can be easily processed so as to provide multiple ways for data visualization and exploration. 
It can also be further postprocessed so that: (i) a normalized score of ``outlierness'' can be assigned to each data object, which unifies both the global and local perspectives of outliers into a single definition; and (ii) a ``flat'' (i.e., nonhierarchical) clustering solution composed of clusters extracted from local cuts through the cluster tree (possibly corresponding to different density thresholds) can be obtained, either in an unsupervised or in a semisupervised way. In the unsupervised scenario, the algorithm corresponding to this postprocessing module provides a global, optimal solution to the formal problem of maximizing the overall stability of the extracted clusters. If partially labeled objects or instance-level constraints are provided by the user, the algorithm can solve the problem by considering both constraints violations/satisfactions and cluster stability criteria. An asymptotic complexity analysis, both in terms of running time and memory space, is described. Experiments are reported that involve a variety of synthetic and real datasets, including comparisons with state-of-the-art, density-based clustering and (global and local) outlier detection methods.", acknowledgement = ack-nhfb, articleno = "5", fjournal = "ACM Transactions on Knowledge Discovery from Data (TKDD)", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J1054", } @Article{Berardi:2015:UTR, author = "Giacomo Berardi and Andrea Esuli and Fabrizio Sebastiani", title = "Utility-Theoretic Ranking for Semiautomated Text Classification", journal = j-TKDD, volume = "10", number = "1", pages = "6:1--6:??", month = jul, year = "2015", CODEN = "????", DOI = "http://dx.doi.org/10.1145/2742548", ISSN = "1556-4681 (print), 1556-472X (electronic)", ISSN-L = "1556-4681", bibdate = "Tue Jul 28 17:19:31 MDT 2015", bibsource = "http://www.acm.org/pubs/contents/journals/tkdd/; http://www.math.utah.edu/pub/tex/bib/tkdd.bib", abstract = "Semiautomated Text Classification (SATC) may be defined 
as the task of ranking a set D of automatically labelled textual documents in such a way that, if a human annotator validates (i.e., inspects and corrects where appropriate) the documents in a top-ranked portion of D with the goal of increasing the overall labelling accuracy of D, the expected increase is maximized. An obvious SATC strategy is to rank D so that the documents that the classifier has labelled with the lowest confidence are top ranked. In this work, we show that this strategy is suboptimal. We develop new utility-theoretic ranking methods based on the notion of validation gain, defined as the improvement in classification effectiveness that would derive by validating a given automatically labelled document. We also propose a new effectiveness measure for SATC-oriented ranking methods, based on the expected reduction in classification error brought about by partially validating a list generated by a given ranking method. We report the results of experiments showing that, with respect to the baseline method mentioned earlier, and according to the proposed measure, our utility-theoretic ranking methods can achieve substantially higher expected reductions in classification error.", acknowledgement = ack-nhfb, articleno = "6", fjournal = "ACM Transactions on Knowledge Discovery from Data (TKDD)", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J1054", } @Article{Yu:2015:DIP, author = "Zhiwen Yu and Zhu Wang and Huilei He and Jilei Tian and Xinjiang Lu and Bin Guo", title = "Discovering Information Propagation Patterns in Microblogging Services", journal = j-TKDD, volume = "10", number = "1", pages = "7:1--7:??", month = jul, year = "2015", CODEN = "????", DOI = "http://dx.doi.org/10.1145/2742801", ISSN = "1556-4681 (print), 1556-472X (electronic)", ISSN-L = "1556-4681", bibdate = "Tue Jul 28 17:19:31 MDT 2015", bibsource = "http://www.acm.org/pubs/contents/journals/tkdd/; http://www.math.utah.edu/pub/tex/bib/tkdd.bib", abstract = "During the last 
decade, microblog has become an important social networking service with billions of users all over the world, acting as a novel and efficient platform for the creation and dissemination of real-time information. Modeling and revealing the information propagation patterns in microblogging services cannot only lead to more accurate understanding of user behaviors and provide insights into the underlying sociology, but also enable useful applications such as trending prediction, recommendation and filtering, spam detection and viral marketing. In this article, we aim to reveal the information propagation patterns in Sina Weibo, the biggest microblogging service in China. First, the cascade of each message is represented as a tree based on its retweeting process. Afterwards, we divide the information propagation pattern into two levels, that is, the macro level and the micro level. On one hand, the macro propagation patterns refer to general propagation modes that are extracted by grouping propagation trees based on hierarchical clustering. On the other hand, the micro propagation patterns are frequent information flow patterns that are discovered using tree-based mining techniques. 
Experimental results show that several interesting patterns are extracted, such as popular message propagation, artificial propagation, and typical information flows between different types of users.", acknowledgement = ack-nhfb, articleno = "7", fjournal = "ACM Transactions on Knowledge Discovery from Data (TKDD)", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J1054", } @Article{Zhang:2015:SMB, author = "Xianchao Zhang and Xiaotong Zhang and Han Liu", title = "Smart Multitask {Bregman} Clustering and Multitask Kernel Clustering", journal = j-TKDD, volume = "10", number = "1", pages = "8:1--8:??", month = jul, year = "2015", CODEN = "????", DOI = "http://dx.doi.org/10.1145/2747879", ISSN = "1556-4681 (print), 1556-472X (electronic)", ISSN-L = "1556-4681", bibdate = "Tue Jul 28 17:19:31 MDT 2015", bibsource = "http://www.acm.org/pubs/contents/journals/tkdd/; http://www.math.utah.edu/pub/tex/bib/tkdd.bib", abstract = "Traditional clustering algorithms deal with a single clustering task on a single dataset. However, there are many related tasks in the real world, which motivates multitask clustering. Recently some multitask clustering algorithms have been proposed, and among them multitask Bregman clustering (MBC) is a very applicable method. MBC alternatively updates clusters and learns relationships between clusters of different tasks, and the two phases boost each other. However, the boosting does not always have positive effects on improving the clustering performance, it may also cause negative effects. Another issue of MBC is that it cannot deal with nonlinear separable data. In this article, we show that in MBC, the process of using cluster relationship to boost the cluster updating phase may cause negative effects, that is, cluster centroids may be skewed under some conditions. We propose a smart multitask Bregman clustering (S-MBC) algorithm which can identify the negative effects of the boosting and avoid the negative effects if they occur. 
We then propose a multitask kernel clustering (MKC) framework for nonlinear separable data by using a similar framework like MBC in the kernel space. We also propose a specific optimization method, which is quite different from that of MBC, to implement the MKC framework. Since MKC can also cause negative effects like MBC, we further extend the framework of MKC to a smart multitask kernel clustering (S-MKC) framework in a similar way that S-MBC is extended from MBC. We conduct experiments on 10 real world multitask clustering datasets to evaluate the performance of S-MBC and S-MKC. The results on clustering accuracy show that: (1) compared with the original MBC algorithm MBC, S-MBC and S-MKC perform much better; (2) compared with the convex discriminative multitask relationship clustering (DMTRC) algorithms DMTRC-L and DMTRC-R which also avoid negative transfer, S-MBC and S-MKC perform worse in the (ideal) case in which different tasks have the same cluster number and the empirical label marginal distribution in each task distributes evenly, but better or comparable in other (more general) cases. Moreover, S-MBC and S-MKC can work on the datasets in which different tasks have different number of clusters, violating the assumptions of DMTRC-L and DMTRC-R. The results on efficiency show that S-MBC and S-MKC consume more computational time than MBC and less computational time than DMTRC-L and DMTRC-R. Overall S-MBC and S-MKC are competitive compared with the state-of-the-art multitask clustering algorithms in synthetical terms of accuracy, efficiency and applicability.", acknowledgement = ack-nhfb, articleno = "8", fjournal = "ACM Transactions on Knowledge Discovery from Data (TKDD)", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J1054", } @Article{Wei:2015:MTP, author = "Wei Wei and Kathleen M. 
Carley", title = "Measuring Temporal Patterns in Dynamic Social Networks", journal = j-TKDD, volume = "10", number = "1", pages = "9:1--9:??", month = jul, year = "2015", CODEN = "????", DOI = "http://dx.doi.org/10.1145/2749465", ISSN = "1556-4681 (print), 1556-472X (electronic)", ISSN-L = "1556-4681", bibdate = "Tue Jul 28 17:19:31 MDT 2015", bibsource = "http://www.acm.org/pubs/contents/journals/tkdd/; http://www.math.utah.edu/pub/tex/bib/tkdd.bib", abstract = "Given social networks over time, how can we measure network activities across different timesteps with a limited number of metrics? We propose two classes of dynamic metrics for assessing temporal evolution patterns of agents in terms of persistency and emergence. For each class of dynamic metrics, we implement it using three different temporal aggregation models ranging from the most commonly used Average Aggregation Model to the more complex models such as the Exponential Aggregation Model. We argue that the problem of measuring temporal patterns can be formulated using Recency and Primacy effect, which is a concept used to characterize human cognitive processes. Experimental results show that the way metrics model Recency--Primacy effect is closely related to their abilities to measure temporal patterns. Furthermore, our results indicate that future network agent activities can be predicted based on history information using dynamic metrics. By conducting multiple experiments, we are also able to find an optimal length of history information that is most relevant to future activities. 
This optimal length is highly consistent within a dataset and can be used as an intrinsic metric to evaluate a dynamic social network.", acknowledgement = ack-nhfb, articleno = "9", fjournal = "ACM Transactions on Knowledge Discovery from Data (TKDD)", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J1054", } @Article{Liu:2015:RAT, author = "Siyuan Liu and Qiang Qu and Shuhui Wang", title = "Rationality Analytics from Trajectories", journal = j-TKDD, volume = "10", number = "1", pages = "10:1--10:??", month = jul, year = "2015", CODEN = "????", DOI = "http://dx.doi.org/10.1145/2735634", ISSN = "1556-4681 (print), 1556-472X (electronic)", ISSN-L = "1556-4681", bibdate = "Tue Jul 28 17:19:31 MDT 2015", bibsource = "http://www.acm.org/pubs/contents/journals/tkdd/; http://www.math.utah.edu/pub/tex/bib/tkdd.bib", abstract = "The availability of trajectories tracking the geographical locations of people as a function of time offers an opportunity to study human behaviors. In this article, we study rationality from the perspective of user decision on visiting a point of interest (POI) which is represented as a trajectory. However, the analysis of rationality is challenged by a number of issues, for example, how to model a trajectory in terms of complex user decision processes? and how to detect hidden factors that have significant impact on the rational decision making? In this study, we propose Rationality Analysis Model (RAM) to analyze rationality from trajectories in terms of a set of impact factors. In order to automatically identify hidden factors, we propose a method, Collective Hidden Factor Retrieval (CHFR), which can also be generalized to parse multiple trajectories at the same time or parse individual trajectories of different time periods. Extensive experimental study is conducted on three large-scale real-life datasets (i.e., taxi trajectories, user shopping trajectories, and visiting trajectories in a theme park). 
The results show that the proposed methods are efficient, effective, and scalable. We also deploy a system in a large theme park to conduct a field study. Interesting findings and user feedback of the field study are provided to support other applications in user behavior mining and analysis, such as business intelligence and user management for marketing purposes.", acknowledgement = ack-nhfb, articleno = "10", fjournal = "ACM Transactions on Knowledge Discovery from Data (TKDD)", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J1054", } @Article{Jia:2015:SGR, author = "Adele Lu Jia and Siqi Shen and Ruud {Van De Bovenkamp} and Alexandru Iosup and Fernando Kuipers and Dick H. J. Epema", title = "Socializing by Gaming: Revealing Social Relationships in Multiplayer Online Games", journal = j-TKDD, volume = "10", number = "2", pages = "11:1--11:??", month = oct, year = "2015", CODEN = "????", DOI = "http://dx.doi.org/10.1145/2736698", ISSN = "1556-4681 (print), 1556-472X (electronic)", ISSN-L = "1556-4681", bibdate = "Mon Oct 26 17:19:18 MDT 2015", bibsource = "http://www.acm.org/pubs/contents/journals/tkdd/; http://www.math.utah.edu/pub/tex/bib/tkdd.bib", abstract = "Multiplayer Online Games (MOGs) like Defense of the Ancients and StarCraft II have attracted hundreds of millions of users who communicate, interact, and socialize with each other through gaming. In MOGs, rich social relationships emerge and can be used to improve gaming services such as match recommendation and game population retention, which are important for the user experience and the commercial value of the companies who run these MOGs. In this work, we focus on understanding social relationships in MOGs. We propose a graph model that is able to capture social relationships of a variety of types and strengths. We apply our model to real-world data collected from three MOGs that contain in total over ten years of behavioral history for millions of players and matches. 
We compare social relationships in MOGs across different game genres and with regular online social networks like Facebook. Taking match recommendation as an example application of our model, we propose SAMRA, a Socially Aware Match Recommendation Algorithm that takes social relationships into account. We show that our model not only improves the precision of traditional link prediction approaches, but also potentially helps players enjoy games to a higher extent.", acknowledgement = ack-nhfb, articleno = "11", fjournal = "ACM Transactions on Knowledge Discovery from Data (TKDD)", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J1054", } @Article{Papagelis:2015:RSG, author = "Manos Papagelis", title = "Refining Social Graph Connectivity via Shortcut Edge Addition", journal = j-TKDD, volume = "10", number = "2", pages = "12:1--12:??", month = oct, year = "2015", CODEN = "????", DOI = "http://dx.doi.org/10.1145/2757281", ISSN = "1556-4681 (print), 1556-472X (electronic)", ISSN-L = "1556-4681", bibdate = "Mon Oct 26 17:19:18 MDT 2015", bibsource = "http://www.acm.org/pubs/contents/journals/tkdd/; http://www.math.utah.edu/pub/tex/bib/tkdd.bib", abstract = "Small changes on the structure of a graph can have a dramatic effect on its connectivity. While in the traditional graph theory, the focus is on well-defined properties of graph connectivity, such as biconnectivity, in the context of a social graph, connectivity is typically manifested by its ability to carry on social processes. In this paper, we consider the problem of adding a small set of nonexisting edges ( shortcuts ) in a social graph with the main objective of minimizing its characteristic path length. This property determines the average distance between pairs of vertices and essentially controls how broadly information can propagate through a network. 
We formally define the problem of interest, characterize its hardness and propose a novel method, path screening, which quickly identifies important shortcuts to guide the augmentation of the graph. We devise a sampling-based variant of our method that can scale up the computation in larger graphs. The claims of our methods are formally validated. Through experiments on real and synthetic data, we demonstrate that our methods are a multitude of times faster than standard approaches, their accuracy outperforms sensible baselines and they can ease the spread of information in a network, for a varying range of conditions.", acknowledgement = ack-nhfb, articleno = "12", fjournal = "ACM Transactions on Knowledge Discovery from Data (TKDD)", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J1054", } @Article{Hong:2015:CAR, author = "Liang Hong and Lei Zou and Cheng Zeng and Luming Zhang and Jian Wang and Jilei Tian", title = "Context-Aware Recommendation Using Role-Based Trust Network", journal = j-TKDD, volume = "10", number = "2", pages = "13:1--13:??", month = oct, year = "2015", CODEN = "????", DOI = "http://dx.doi.org/10.1145/2751562", ISSN = "1556-4681 (print), 1556-472X (electronic)", ISSN-L = "1556-4681", bibdate = "Mon Oct 26 17:19:18 MDT 2015", bibsource = "http://www.acm.org/pubs/contents/journals/tkdd/; http://www.math.utah.edu/pub/tex/bib/tkdd.bib", abstract = "Recommender systems have been studied comprehensively in both academic and industrial fields over the past decade. As user interests can be affected by context at any time and any place in mobile scenarios, rich context information becomes more and more important for personalized context-aware recommendations. 
Although existing context-aware recommender systems can make context-aware recommendations to some extent, they suffer several inherent weaknesses: (1) Users' context-aware interests are not modeled realistically, which reduces the recommendation quality; (2) Current context-aware recommender systems ignore trust relations among users. Trust relations are actually context-aware and associated with certain aspects (i.e., categories of items) in mobile scenarios. In this article, we define a term role to model common context-aware interests among a group of users. We propose an efficient role mining algorithm to mine roles from a ``user-context-behavior'' matrix, and a role-based trust model to calculate context-aware trust value between two users. During online recommendation, given a user u in a context c, an efficient weighted set similarity query (WSSQ) algorithm is designed to build u 's role-based trust network in context c. Finally, we make recommendations to u based on u 's role-based trust network by considering both context-aware roles and trust relations. 
Extensive experiments demonstrate that our recommendation approach outperforms the state-of-the-art methods in both effectiveness and efficiency.", acknowledgement = ack-nhfb, articleno = "13", fjournal = "ACM Transactions on Knowledge Discovery from Data (TKDD)", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J1054", } @Article{Zhang:2015:OBF, author = "Lei Zhang and Ping Luo and Linpeng Tang and Enhong Chen and Qi Liu and Min Wang and Hui Xiong", title = "Occupancy-Based Frequent Pattern Mining", journal = j-TKDD, volume = "10", number = "2", pages = "14:1--14:??", month = oct, year = "2015", CODEN = "????", DOI = "http://dx.doi.org/10.1145/2753765", ISSN = "1556-4681 (print), 1556-472X (electronic)", ISSN-L = "1556-4681", bibdate = "Mon Oct 26 17:19:18 MDT 2015", bibsource = "http://www.acm.org/pubs/contents/journals/tkdd/; http://www.math.utah.edu/pub/tex/bib/tkdd.bib", abstract = "Frequent pattern mining is an important data mining problem with many broad applications. Most studies in this field use support (frequency) to measure the popularity of a pattern, namely the fraction of transactions or sequences that include the pattern in a data set. In this study, we introduce a new interesting measure, namely occupancy, to measure the completeness of a pattern in its supporting transactions or sequences. This is motivated by some real-world pattern recommendation applications in which an interesting pattern should not only be frequent, but also occupies a large portion of its supporting transactions or sequences. With the definition of occupancy we call a pattern dominant if its occupancy value is above a user-specified threshold. Then, our task is to identify the qualified patterns which are both dominant and frequent. Also, we formulate the problem of mining top-k qualified patterns, that is, finding k qualified patterns with maximum values on a user-defined function of support and occupancy, for example, weighted sum of support and occupancy. 
The challenge to these tasks is that the value of occupancy does not change monotonically when more items are appended to a given pattern. Therefore, we propose a general algorithm called DOFRA (DOminant and FRequent pattern mining Algorithm) for mining these qualified patterns, which explores the upper bound properties on occupancy to drastically reduce the search process. Finally, we show the effectiveness of DOFRA in two real-world applications and also demonstrate the efficiency of DOFRA on several real and large synthetic datasets.", acknowledgement = ack-nhfb, articleno = "14", fjournal = "ACM Transactions on Knowledge Discovery from Data (TKDD)", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J1054", } @Article{Chen:2015:AAS, author = "Hung-Hsuan Chen and C. Lee Giles", title = "{ASCOS++}: An Asymmetric Similarity Measure for Weighted Networks to Address the Problem of {SimRank}", journal = j-TKDD, volume = "10", number = "2", pages = "15:1--15:??", month = oct, year = "2015", CODEN = "????", DOI = "http://dx.doi.org/10.1145/2776894", ISSN = "1556-4681 (print), 1556-472X (electronic)", ISSN-L = "1556-4681", bibdate = "Mon Oct 26 17:19:18 MDT 2015", bibsource = "http://www.acm.org/pubs/contents/journals/tkdd/; http://www.math.utah.edu/pub/tex/bib/pagerank.bib; http://www.math.utah.edu/pub/tex/bib/tkdd.bib", abstract = "In this article, we explore the relationships among digital objects in terms of their similarity based on vertex similarity measures. We argue that SimRank --- a famous similarity measure --- and its families, such as P-Rank and SimRank++, fail to capture similar node pairs in certain conditions, especially when two nodes can only reach each other through paths of odd lengths. We present new similarity measures ASCOS and ASCOS++ to address the problem. ASCOS outputs a more complete similarity score than SimRank and SimRank's families. 
ASCOS++ enriches ASCOS to include edge weight into the measure, giving all edges and network weights an opportunity to make their contribution. We show that both ASCOS++ and ASCOS can be reformulated and applied on a distributed environment for parallel contribution. Experimental results show that ASCOS++ reports a better score than SimRank and several famous similarity measures. Finally, we re-examine previous use cases of SimRank, and explain appropriate and inappropriate use cases. We suggest future SimRank users following the rules proposed here before na{\"\i}vely applying it. We also discuss the relationship between ASCOS++ and PageRank.", acknowledgement = ack-nhfb, articleno = "15", fjournal = "ACM Transactions on Knowledge Discovery from Data (TKDD)", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J1054", } @Article{Zafarani:2015:UIA, author = "Reza Zafarani and Lei Tang and Huan Liu", title = "User Identification Across Social Media", journal = j-TKDD, volume = "10", number = "2", pages = "16:1--16:??", month = oct, year = "2015", CODEN = "????", DOI = "http://dx.doi.org/10.1145/2747880", ISSN = "1556-4681 (print), 1556-472X (electronic)", ISSN-L = "1556-4681", bibdate = "Mon Oct 26 17:19:18 MDT 2015", bibsource = "http://www.acm.org/pubs/contents/journals/tkdd/; http://www.math.utah.edu/pub/tex/bib/tkdd.bib", abstract = "People use various social media sites for different purposes. The information on each site is often partial. When sources of complementary information are integrated, a better profile of a user can be built. This profile can help improve online services such as advertising across sites. To integrate these sources of information, it is necessary to identify individuals across social media sites. This paper aims to address the cross-media user identification problem. 
We provide evidence on the existence of a mapping among identities of individuals across social media sites, study the feasibility of finding this mapping, and illustrate and develop means for finding this mapping. Our studies show that effective approaches that exploit information redundancies due to users' unique behavioral patterns can be utilized to find such a mapping. This study paves the way for analysis and mining across social networking sites, and facilitates the creation of novel online services across sites. In particular, recommending friends and advertising across networks, analyzing information diffusion across sites, and studying specific user behavior such as user migration across sites in social media are one of the many areas that can benefit from the results of this study.", acknowledgement = ack-nhfb, articleno = "16", fjournal = "ACM Transactions on Knowledge Discovery from Data (TKDD)", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J1054", } @Article{Li:2015:RUC, author = "Lei Li and Wei Peng and Saurabh Kataria and Tong Sun and Tao Li", title = "Recommending Users and Communities in Social Media", journal = j-TKDD, volume = "10", number = "2", pages = "17:1--17:??", month = oct, year = "2015", CODEN = "????", DOI = "http://dx.doi.org/10.1145/2757282", ISSN = "1556-4681 (print), 1556-472X (electronic)", ISSN-L = "1556-4681", bibdate = "Mon Oct 26 17:19:18 MDT 2015", bibsource = "http://www.acm.org/pubs/contents/journals/tkdd/; http://www.math.utah.edu/pub/tex/bib/tkdd.bib", abstract = "Social media has become increasingly prevalent in the last few years, not only enabling people to connect with each other by social links, but also providing platforms for people to share information and interact over diverse topics. Rich user-generated information, for example, users' relationships and daily posts, are often available in most social media service websites. 
Given such information, a challenging problem is to provide reasonable user and community recommendation for a target user, and consequently, help the target user engage in the daily discussions and activities with his/her friends or like-minded people. In this article, we propose a unified framework of recommending users and communities that utilizes the information in social media. Given a user's profile or a set of keywords as input, our framework is capable of recommending influential users and topic-cohesive interactive communities that are most relevant to the given user or keywords. With the proposed framework, users can find other individuals or communities sharing similar interests, and then have more interaction with these users or within the communities. We present a generative topic model to discover user-oriented and community-oriented topics simultaneously, which enables us to capture the exact topical interests of users, as well as the focuses of communities. Extensive experimental evaluation and case studies on a dataset collected from Twitter demonstrate the effectiveness of our proposed framework compared with other probabilistic-topic-model-based recommendation methods.", acknowledgement = ack-nhfb, articleno = "17", fjournal = "ACM Transactions on Knowledge Discovery from Data (TKDD)", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J1054", } @Article{Yu:2015:GGA, author = "Rose Yu and Xinran He and Yan Liu", title = "{GLAD}: Group Anomaly Detection in Social Media Analysis", journal = j-TKDD, volume = "10", number = "2", pages = "18:1--18:??", month = oct, year = "2015", CODEN = "????", DOI = "http://dx.doi.org/10.1145/2811268", ISSN = "1556-4681 (print), 1556-472X (electronic)", ISSN-L = "1556-4681", bibdate = "Mon Oct 26 17:19:18 MDT 2015", bibsource = "http://www.acm.org/pubs/contents/journals/tkdd/; http://www.math.utah.edu/pub/tex/bib/tkdd.bib", abstract = "Traditional anomaly detection on social media mostly focuses on individual 
point anomalies while anomalous phenomena usually occur in groups. Therefore, it is valuable to study the collective behavior of individuals and detect group anomalies. Existing group anomaly detection approaches rely on the assumption that the groups are known, which can hardly be true in real world social media applications. In this article, we take a generative approach by proposing a hierarchical Bayes model: Group Latent Anomaly Detection (GLAD) model. GLAD takes both pairwise and point-wise data as input, automatically infers the groups and detects group anomalies simultaneously. To account for the dynamic properties of the social media data, we further generalize GLAD to its dynamic extension d-GLAD. We conduct extensive experiments to evaluate our models on both synthetic and real world datasets. The empirical results demonstrate that our approach is effective and robust in discovering latent groups and detecting group anomalies.", acknowledgement = ack-nhfb, articleno = "18", fjournal = "ACM Transactions on Knowledge Discovery from Data (TKDD)", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J1054", } @Article{Chakrabarti:2015:BPL, author = "Aniket Chakrabarti and Venu Satuluri and Atreya Srivathsan and Srinivasan Parthasarathy", title = "A {Bayesian} Perspective on Locality Sensitive Hashing with Extensions for Kernel Methods", journal = j-TKDD, volume = "10", number = "2", pages = "19:1--19:??", month = oct, year = "2015", CODEN = "????", DOI = "http://dx.doi.org/10.1145/2778990", ISSN = "1556-4681 (print), 1556-472X (electronic)", ISSN-L = "1556-4681", bibdate = "Mon Oct 26 17:19:18 MDT 2015", bibsource = "http://www.acm.org/pubs/contents/journals/tkdd/; http://www.math.utah.edu/pub/tex/bib/hash.bib; http://www.math.utah.edu/pub/tex/bib/tkdd.bib", abstract = "Given a collection of objects and an associated similarity measure, the all-pairs similarity search problem asks us to find all pairs of objects with similarity greater than a certain 
user-specified threshold. In order to reduce the number of candidates to search, locality-sensitive hashing (LSH) based indexing methods are very effective. However, most such methods only use LSH for the first phase of similarity search --- that is, efficient indexing for candidate generation. In this article, we present BayesLSH, a principled Bayesian algorithm for the subsequent phase of similarity search --- performing candidate pruning and similarity estimation using LSH. A simpler variant, BayesLSH-Lite, which calculates similarities exactly, is also presented. Our algorithms are able to quickly prune away a large majority of the false positive candidate pairs, leading to significant speedups over baseline approaches. For BayesLSH, we also provide probabilistic guarantees on the quality of the output, both in terms of accuracy and recall. Finally, the quality of BayesLSH's output can be easily tuned and does not require any manual setting of the number of hashes to use for similarity estimation, unlike standard approaches. For two state-of-the-art candidate generation algorithms, AllPairs and LSH, BayesLSH enables significant speedups, typically in the range 2 $ \times $ --20 $ \times $ for a wide variety of datasets. We also extend the BayesLSH algorithm for kernel methods --- in which the similarity between two data objects is defined by a kernel function. Since the embedding of data points in the transformed kernel space is unknown, algorithms such as AllPairs which rely on building inverted index structure for fast similarity search do not work with kernel functions. Exhaustive search across all possible pairs is also not an option since the dataset can be huge and computing the kernel values for each pair can be prohibitive. We propose K-BayesLSH an all-pairs similarity search problem for kernel functions. 
K-BayesLSH leverages a recently proposed idea --- kernelized locality sensitive hashing (KLSH) --- for hash bit computation and candidate generation, and uses the aforementioned BayesLSH idea for candidate pruning and similarity estimation. We ran a broad spectrum of experiments on a variety of datasets drawn from different domains and with distinct kernels and find a speedup of 2 $ \times $ --7 $ \times $ over vanilla KLSH.", acknowledgement = ack-nhfb, articleno = "19", fjournal = "ACM Transactions on Knowledge Discovery from Data (TKDD)", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J1054", } @Article{Zhang:2015:DAV, author = "Yao Zhang and B. Aditya Prakash", title = "Data-Aware Vaccine Allocation Over Large Networks", journal = j-TKDD, volume = "10", number = "2", pages = "20:1--20:??", month = oct, year = "2015", CODEN = "????", DOI = "http://dx.doi.org/10.1145/2803176", ISSN = "1556-4681 (print), 1556-472X (electronic)", ISSN-L = "1556-4681", bibdate = "Mon Oct 26 17:19:18 MDT 2015", bibsource = "http://www.acm.org/pubs/contents/journals/tkdd/; http://www.math.utah.edu/pub/tex/bib/tkdd.bib", abstract = "Given a graph, like a social/computer network or the blogosphere, in which an infection (or meme or virus) has been spreading for some time, how to select the k best nodes for immunization/quarantining immediately? Most previous works for controlling propagation (say via immunization) have concentrated on developing strategies for vaccination preemptively before the start of the epidemic. While very useful to provide insights in to which baseline policies can best control an infection, they may not be ideal to make real-time decisions as the infection is progressing. In this paper, we study how to immunize healthy nodes, in the presence of already infected nodes. Efficient algorithms for such a problem can help public-health experts make more informed choices, tailoring their decisions to the actual distribution of the epidemic on the ground. 
First we formulate the Data-Aware Vaccination problem, and prove it is NP-hard and also that it is hard to approximate. Secondly, we propose three effective polynomial-time heuristics DAVA, DAVA-prune and DAVA-fast, of varying degrees of efficiency and performance. Finally, we also demonstrate the scalability and effectiveness of our algorithms through extensive experiments on multiple real networks including large epidemiology datasets (containing millions of interactions). Our algorithms show substantial gains of up to ten times more healthy nodes at the end against many other intuitive and nontrivial competitors.", acknowledgement = ack-nhfb, articleno = "20", fjournal = "ACM Transactions on Knowledge Discovery from Data (TKDD)", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J1054", } @Article{Rowe:2016:MUD, author = "Matthew Rowe", title = "Mining User Development Signals for Online Community Churner Detection", journal = j-TKDD, volume = "10", number = "3", pages = "21:1--21:??", month = feb, year = "2016", CODEN = "????", DOI = "http://dx.doi.org/10.1145/2798730", ISSN = "1556-4681 (print), 1556-472X (electronic)", ISSN-L = "1556-4681", bibdate = "Thu Feb 25 05:56:34 MST 2016", bibsource = "http://www.acm.org/pubs/contents/journals/tkdd/; http://www.math.utah.edu/pub/tex/bib/tkdd.bib", abstract = "Churners are users who stop using a given service after previously signing up. In the domain of telecommunications and video games, churners represent a loss of revenue as a user leaving indicates that they will no longer pay for the service. In the context of online community platforms (e.g., community message boards, social networking sites, question--answering systems, etc.), the churning of a user can represent different kinds of loss: of social capital, of expertise, or of a vibrant individual who is a mediator for interaction and communication. 
Detecting which users are likely to churn from online communities, therefore, enables community managers to offer incentives to entice those users back; as retention is less expensive than re-signing users up. In this article, we tackle the task of detecting churners on four online community platforms by mining user development signals. These signals explain how users have evolved along different dimensions (i.e., social and lexical) relative to their prior behaviour and the community in which they have interacted. We present a linear model, based upon elastic-net regularisation, that uses extracted features from the signals to detect churners. Our evaluation of this model against several state of the art baselines, including our own prior work, empirically demonstrates the superior performance that this approach achieves for several experimental settings. This article presents a novel approach to churn prediction that takes a different route from existing approaches that are based on measuring static social network properties of users (e.g., centrality, in-degree, etc.).", acknowledgement = ack-nhfb, articleno = "21", fjournal = "ACM Transactions on Knowledge Discovery from Data (TKDD)", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J1054", } @Article{Prat-Perez:2016:PTT, author = "Arnau Prat-P{\'e}rez and David Dominguez-Sal and Josep-M. 
Brunat and Josep-Lluis Larriba-Pey", title = "Put Three and Three Together: Triangle-Driven Community Detection", journal = j-TKDD, volume = "10", number = "3", pages = "22:1--22:??", month = feb, year = "2016", CODEN = "????", DOI = "http://dx.doi.org/10.1145/2775108", ISSN = "1556-4681 (print), 1556-472X (electronic)", ISSN-L = "1556-4681", bibdate = "Thu Feb 25 05:56:34 MST 2016", bibsource = "http://www.acm.org/pubs/contents/journals/tkdd/; http://www.math.utah.edu/pub/tex/bib/tkdd.bib", abstract = "Community detection has arisen as one of the most relevant topics in the field of graph data mining due to its applications in many fields such as biology, social networks, or network traffic analysis. Although the existing metrics used to quantify the quality of a community work well in general, under some circumstances, they fail at correctly capturing such notion. The main reason is that these metrics consider the internal community edges as a set, but ignore how these actually connect the vertices of the community. We propose the Weighted Community Clustering (WCC), which is a new community metric that takes the triangle instead of the edge as the minimal structural motif indicating the presence of a strong relation in a graph. We theoretically analyse WCC in depth and formally prove, by means of a set of properties, that the maximization of WCC guarantees communities with cohesion and structure. In addition, we propose Scalable Community Detection (SCD), a community detection algorithm based on WCC, which is designed to be fast and scalable on SMP machines, showing experimentally that WCC correctly captures the concept of community in social networks using real datasets. 
Finally, using ground-truth data, we show that SCD provides better quality than the best disjoint community detection algorithms of the state of the art while performing faster.", acknowledgement = ack-nhfb, articleno = "22", fjournal = "ACM Transactions on Knowledge Discovery from Data (TKDD)", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J1054", } @Article{Guo:2016:MDM, author = "Zhen Guo and Zhongfei (Mark) Zhang and Eric P. Xing and Christos Faloutsos", title = "Multimodal Data Mining in a Multimedia Database Based on Structured Max Margin Learning", journal = j-TKDD, volume = "10", number = "3", pages = "23:1--23:??", month = feb, year = "2016", CODEN = "????", DOI = "http://dx.doi.org/10.1145/2742549", ISSN = "1556-4681 (print), 1556-472X (electronic)", ISSN-L = "1556-4681", bibdate = "Thu Feb 25 05:56:34 MST 2016", bibsource = "http://www.acm.org/pubs/contents/journals/tkdd/; http://www.math.utah.edu/pub/tex/bib/tkdd.bib", abstract = "Mining knowledge from a multimedia database has received increasing attentions recently since huge repositories are made available by the development of the Internet. In this article, we exploit the relations among different modalities in a multimedia database and present a framework for general multimodal data mining problem where image annotation and image retrieval are considered as the special cases. Specifically, the multimodal data mining problem can be formulated as a structured prediction problem where we learn the mapping from an input to the structured and interdependent output variables. In addition, in order to reduce the demanding computation, we propose a new max margin structure learning approach called Enhanced Max Margin Learning (EMML) framework, which is much more efficient with a much faster convergence rate than the existing max margin learning methods, as verified through empirical evaluations. 
Furthermore, we apply EMML framework to develop an effective and efficient solution to the multimodal data mining problem that is highly scalable in the sense that the query response time is independent of the database scale. The EMML framework allows an efficient multimodal data mining query in a very large scale multimedia database, and excels many existing multimodal data mining methods in the literature that do not scale up at all. The performance comparison with a state-of-the-art multimodal data mining method is reported for the real-world image databases.", acknowledgement = ack-nhfb, articleno = "23", fjournal = "ACM Transactions on Knowledge Discovery from Data (TKDD)", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J1054", } @Article{Myers:2016:DAK, author = "Risa B. Myers and John C. Frenzel MD and Joseph R. Ruiz MD and Christopher M. Jermaine", title = "Do Anesthesiologists Know What They Are Doing? {Mining} a Surgical Time-Series Database to Correlate Expert Assessment with Outcomes", journal = j-TKDD, volume = "10", number = "3", pages = "24:1--24:??", month = feb, year = "2016", CODEN = "????", DOI = "http://dx.doi.org/10.1145/2822897", ISSN = "1556-4681 (print), 1556-472X (electronic)", ISSN-L = "1556-4681", bibdate = "Thu Feb 25 05:56:34 MST 2016", bibsource = "http://www.acm.org/pubs/contents/journals/tkdd/; http://www.math.utah.edu/pub/tex/bib/tkdd.bib", abstract = "Anesthesiologists are taught to carefully manage patient vital signs during surgery. Unfortunately, there is little empirical evidence that vital sign management, as currently practiced, is correlated with patient outcomes. We seek to validate or repudiate current clinical practice and determine whether or not clinician evaluation of surgical vital signs correlate with outcomes. Using a database of over 90,000 cases, we attempt to determine whether those cases that anesthesiologists would subjectively decide are ``low quality'' are more likely to result in negative outcomes. 
The problem reduces to one of multi-dimensional time-series classification. Our approach is to have a set of expert anesthesiologists independently label a small number of training cases, from which we build classifiers and label all 90,000 cases. We then use the labeling to search for correlation with outcomes and compare the prevalence of important 30-day outcomes between providers. To mimic the providers' quality labels, we consider several standard classification methods, such as dynamic time warping in conjunction with a kNN classifier, as well as complexity invariant distance, and a regression based upon the feature extraction methods outlined by Mao et al. 2012 (using features such as time-series mean, standard deviation, skew, etc.). We also propose a new feature selection mechanism that learns a hidden Markov model to segment the time series; the fraction of time that each series spends in each state is used to label the series using a regression-based classifier. In the end, we obtain strong, empirical evidence that current best practice is correlated with reduced negative patient outcomes. 
We also learn that all of the experts were able to significantly separate cases by outcome, with higher prevalence of negative 30-day outcomes in the cases labeled as ``low quality'' for almost all of the outcomes investigated.", acknowledgement = ack-nhfb, articleno = "24", fjournal = "ACM Transactions on Knowledge Discovery from Data (TKDD)", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J1054", } @Article{Namata:2016:CGI, author = "Galileo Mark Namata and Ben London and Lise Getoor", title = "Collective Graph Identification", journal = j-TKDD, volume = "10", number = "3", pages = "25:1--25:??", month = feb, year = "2016", CODEN = "????", DOI = "http://dx.doi.org/10.1145/2818378", ISSN = "1556-4681 (print), 1556-472X (electronic)", ISSN-L = "1556-4681", bibdate = "Thu Feb 25 05:56:34 MST 2016", bibsource = "http://www.acm.org/pubs/contents/journals/tkdd/; http://www.math.utah.edu/pub/tex/bib/tkdd.bib", abstract = "Data describing networks---such as communication networks, transaction networks, disease transmission networks, collaboration networks, etc.---are becoming increasingly available. While observational data can be useful, it often only hints at the actual underlying process that governs interactions and attributes. For example, an email communication network provides insight into its users and their relationships, but is not the same as the ``real'' underlying social network. In this article, we introduce the problem of graph identification, i.e., discovering the latent graph structure underlying an observed network. We cast the problem as a probabilistic inference task, in which we must infer the nodes, edges, and node labels of a hidden graph, based on evidence. This entails solving several canonical problems in network analysis: entity resolution (determining when two observations correspond to the same entity), link prediction (inferring the existence of links), and node labeling (inferring hidden attributes). 
While each of these subproblems has been well studied in isolation, here we consider them as a single, collective task. We present a simple, yet novel, approach to address all three subproblems simultaneously. Our approach, which we refer to as C$^3$, consists of a collection of Coupled Collective Classifiers that are applied iteratively to propagate inferred information among the subproblems. We consider variants of C$^3$ using different learning and inference techniques and empirically demonstrate that C$^3$ is superior, both in terms of predictive accuracy and running time, to state-of-the-art probabilistic approaches on four real problems.", acknowledgement = ack-nhfb, articleno = "25", fjournal = "ACM Transactions on Knowledge Discovery from Data (TKDD)", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J1054", } @Article{Subbian:2016:MIU, author = "Karthik Subbian and Charu Aggarwal and Jaideep Srivastava", title = "Mining Influencers Using Information Flows in Social Streams", journal = j-TKDD, volume = "10", number = "3", pages = "26:1--26:??", month = feb, year = "2016", CODEN = "????", DOI = "http://dx.doi.org/10.1145/2815625", ISSN = "1556-4681 (print), 1556-472X (electronic)", ISSN-L = "1556-4681", bibdate = "Thu Feb 25 05:56:34 MST 2016", bibsource = "http://www.acm.org/pubs/contents/journals/tkdd/; http://www.math.utah.edu/pub/tex/bib/tkdd.bib", abstract = "The problem of discovering information flow trends in social networks has become increasingly relevant due to the increasing amount of content in online social networks, and its relevance as a tool for research into the content trends analysis in the network. An important part of this analysis is to determine the key patterns of flow in the underlying network. Almost all the work in this area has focused on fixed models of the network structure, and edge-based transmission between nodes. 
In this article, we propose a fully content-centered model of flow analysis in networks, in which the analysis is based on actual content transmissions in the underlying social stream, rather than a static model of transmission on the edges. First, we introduce the problem of influence analysis in the context of information flow in networks. We then propose a novel algorithm InFlowMine to discover the information flow patterns in the network and demonstrate the effectiveness of the discovered information flows using an influence mining application. This application illustrates the flexibility and effectiveness of our information flow model to find topic- or network-specific influencers, or their combinations. We empirically show that our information flow mining approach is more effective and efficient than the existing methods on a number of different measures.", acknowledgement = ack-nhfb, articleno = "26", fjournal = "ACM Transactions on Knowledge Discovery from Data (TKDD)", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J1054", } @Article{Angiulli:2016:TGU, author = "Fabrizio Angiulli and Fabio Fassetti", title = "Toward Generalizing the Unification with Statistical Outliers: The Gradient Outlier Factor Measure", journal = j-TKDD, volume = "10", number = "3", pages = "27:1--27:??", month = feb, year = "2016", CODEN = "????", DOI = "http://dx.doi.org/10.1145/2829956", ISSN = "1556-4681 (print), 1556-472X (electronic)", ISSN-L = "1556-4681", bibdate = "Thu Feb 25 05:56:34 MST 2016", bibsource = "http://www.acm.org/pubs/contents/journals/tkdd/; http://www.math.utah.edu/pub/tex/bib/tkdd.bib", abstract = "In this work, we introduce a novel definition of outlier, namely the Gradient Outlier Factor (or GOF), with the aim to provide a definition that unifies with the statistical one on some standard distributions but has a different behavior in the presence of mixture distributions. 
Intuitively, the GOF score measures the probability to stay in the neighborhood of a certain object. It is directly proportional to the density and inversely proportional to the variation of the density. We derive formal properties under which the GOF definition unifies the statistical outlier definition and show that the unification holds for some standard distributions, while the GOF is able to capture tails in the presence of different distributions even if their densities sensibly differ. Moreover, we provide a probabilistic interpretation of the GOF score, by means of the notion of density of the data density. Experimental results confirm that there are scenarios in which the novel definition can be profitably employed. To the best of our knowledge, except for distance-based outlier, no other data mining outlier definition has a so clearly established relationship with statistical outliers.", acknowledgement = ack-nhfb, articleno = "27", fjournal = "ACM Transactions on Knowledge Discovery from Data (TKDD)", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J1054", } @Article{Koutra:2016:DPM, author = "Danai Koutra and Neil Shah and Joshua T. Vogelstein and Brian Gallagher and Christos Faloutsos", title = "{DeltaCon}: Principled Massive-Graph Similarity Function with Attribution", journal = j-TKDD, volume = "10", number = "3", pages = "28:1--28:??", month = feb, year = "2016", CODEN = "????", DOI = "http://dx.doi.org/10.1145/2824443", ISSN = "1556-4681 (print), 1556-472X (electronic)", ISSN-L = "1556-4681", bibdate = "Thu Feb 25 05:56:34 MST 2016", bibsource = "http://www.acm.org/pubs/contents/journals/tkdd/; http://www.math.utah.edu/pub/tex/bib/tkdd.bib", abstract = "How much has a network changed since yesterday? How different is the wiring of Bob's brain (a left-handed male) and Alice's brain (a right-handed female), and how is it different? 
Graph similarity with given node correspondence, i.e., the detection of changes in the connectivity of graphs, arises in numerous settings. In this work, we formally state the axioms and desired properties of the graph similarity functions, and evaluate when state-of-the-art methods fail to detect crucial connectivity changes in graphs. We propose DeltaCon, a principled, intuitive, and scalable algorithm that assesses the similarity between two graphs on the same nodes (e.g., employees of a company, customers of a mobile carrier). In conjunction, we propose DeltaCon-Attr, a related approach that enables attribution of change or dissimilarity to responsible nodes and edges. Experiments on various synthetic and real graphs showcase the advantages of our method over existing similarity measures. Finally, we employ DeltaCon and DeltaCon-Attr on real applications: (a) we classify people to groups of high and low creativity based on their brain connectivity graphs, (b) do temporal anomaly detection in the who-emails-whom Enron graph and find the top culprits for the changes in the temporal corporate email graph, and (c) recover pairs of test-retest large brain scans ({\sim}17M edges, up to 90M edges) for 21 subjects.", acknowledgement = ack-nhfb, articleno = "28", fjournal = "ACM Transactions on Knowledge Discovery from Data (TKDD)", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J1054", } @Article{Zhao:2016:MPA, author = "Wayne Xin Zhao and Jinpeng Wang and Yulan He and Ji-Rong Wen and Edward Y. 
Chang and Xiaoming Li", title = "Mining Product Adopter Information from Online Reviews for Improving Product Recommendation", journal = j-TKDD, volume = "10", number = "3", pages = "29:1--29:??", month = feb, year = "2016", CODEN = "????", DOI = "http://dx.doi.org/10.1145/2842629", ISSN = "1556-4681 (print), 1556-472X (electronic)", ISSN-L = "1556-4681", bibdate = "Thu Feb 25 05:56:34 MST 2016", bibsource = "http://www.acm.org/pubs/contents/journals/tkdd/; http://www.math.utah.edu/pub/tex/bib/tkdd.bib", abstract = "We present in this article an automated framework that extracts product adopter information from online reviews and incorporates the extracted information into feature-based matrix factorization for more effective product recommendation. In specific, we propose a bootstrapping approach for the extraction of product adopters from review text and categorize them into a number of different demographic categories. The aggregated demographic information of many product adopters can be used to characterize both products and users in the form of distributions over different demographic categories. We further propose a graph-based method to iteratively update user- and product-related distributions more reliably in a heterogeneous user--product graph and incorporate them as features into the matrix factorization approach for product recommendation. 
Our experimental results on a large dataset crawled from JingDong, the largest B2C e-commerce website in China, show that our proposed framework outperforms a number of competitive baselines for product recommendation.", acknowledgement = ack-nhfb, articleno = "29", fjournal = "ACM Transactions on Knowledge Discovery from Data (TKDD)", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J1054", } @Article{Duarte:2016:AMR, author = "Jo{\~a}o Duarte and Jo{\~a}o Gama and Albert Bifet", title = "Adaptive Model Rules From High-Speed Data Streams", journal = j-TKDD, volume = "10", number = "3", pages = "30:1--30:??", month = feb, year = "2016", CODEN = "????", DOI = "http://dx.doi.org/10.1145/2829955", ISSN = "1556-4681 (print), 1556-472X (electronic)", ISSN-L = "1556-4681", bibdate = "Thu Feb 25 05:56:34 MST 2016", bibsource = "http://www.acm.org/pubs/contents/journals/tkdd/; http://www.math.utah.edu/pub/tex/bib/tkdd.bib", abstract = "Decision rules are one of the most expressive and interpretable models for machine learning. In this article, we present Adaptive Model Rules (AMRules), the first stream rule learning algorithm for regression problems. In AMRules, the antecedent of a rule is a conjunction of conditions on the attribute values, and the consequent is a linear combination of the attributes. In order to maintain a regression model compatible with the most recent state of the process generating data, each rule uses a Page-Hinkley test to detect changes in this process and react to changes by pruning the rule set. Online learning might be strongly affected by outliers. AMRules is also equipped with outliers detection mechanisms to avoid model adaption using anomalous examples. 
In the experimental section, we report the results of AMRules on benchmark regression problems, and compare the performance of our system with other streaming regression algorithms.", acknowledgement = ack-nhfb, articleno = "30", fjournal = "ACM Transactions on Knowledge Discovery from Data (TKDD)", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J1054", } @Article{Lu:2016:SCB, author = "Faming Lu and Qingtian Zeng and Hua Duan", title = "Synchronization-Core-Based Discovery of Processes with Decomposable Cyclic Dependencies", journal = j-TKDD, volume = "10", number = "3", pages = "31:1--31:??", month = feb, year = "2016", CODEN = "????", DOI = "http://dx.doi.org/10.1145/2845086", ISSN = "1556-4681 (print), 1556-472X (electronic)", ISSN-L = "1556-4681", bibdate = "Thu Feb 25 05:56:34 MST 2016", bibsource = "http://www.acm.org/pubs/contents/journals/tkdd/; http://www.math.utah.edu/pub/tex/bib/tkdd.bib", abstract = "Traditional process discovery techniques mine process models based upon event traces giving little consideration to workflow relevant data recorded in event logs. The neglect of such information usually leads to incorrect discovered models, especially when activities have decomposable cyclic dependencies. To address this problem, the recorded workflow relevant data and decision tree learning technique are utilized to classify cases into case clusters. Each case cluster contains causality and concurrency activity dependencies only. Then, a set of activity ordering relations are derived based on case clusters. And a synchronization-core-based process model is discovered from the ordering relations and composite cases. Finally, the discovered model is transformed to a BPMN model. The proposed approach is validated with a medical treatment process and an open event log. 
Meanwhile, a prototype system is presented.", acknowledgement = ack-nhfb, articleno = "31", fjournal = "ACM Transactions on Knowledge Discovery from Data (TKDD)", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J1054", } @Article{Liu:2016:EAW, author = "Yashu Liu and Jie Wang and Jieping Ye", title = "An Efficient Algorithm For Weak Hierarchical Lasso", journal = j-TKDD, volume = "10", number = "3", pages = "32:1--32:??", month = feb, year = "2016", CODEN = "????", DOI = "http://dx.doi.org/10.1145/2791295", ISSN = "1556-4681 (print), 1556-472X (electronic)", ISSN-L = "1556-4681", bibdate = "Thu Feb 25 05:56:34 MST 2016", bibsource = "http://www.acm.org/pubs/contents/journals/tkdd/; http://www.math.utah.edu/pub/tex/bib/tkdd.bib", abstract = "Linear regression is a widely used tool in data mining and machine learning. In many applications, fitting a regression model with only linear effects may not be sufficient for predictive or explanatory purposes. One strategy that has recently received increasing attention in statistics is to include feature interactions to capture the nonlinearity in the regression model. Such model has been applied successfully in many biomedical applications. One major challenge in the use of such model is that the data dimensionality is significantly higher than the original data, resulting in the small sample size large dimension problem. Recently, weak hierarchical Lasso, a sparse interaction regression model, is proposed that produces a sparse and hierarchical structured estimator by exploiting the Lasso penalty and a set of hierarchical constraints. However, the hierarchical constraints make it a non-convex problem and the existing method finds the solution to its convex relaxation, which needs additional conditions to guarantee the hierarchical structure. 
In this article, we propose to directly solve the non-convex weak hierarchical Lasso by making use of the General Iterative Shrinkage and Thresholding (GIST) optimization framework, which has been shown to be efficient for solving non-convex sparse formulations. The key step in GIST is to compute a sequence of proximal operators. One of our key technical contributions is to show that the proximal operator associated with the non-convex weak hierarchical Lasso admits a closed-form solution. However, a naive approach for solving each subproblem of the proximal operator leads to a quadratic time complexity, which is not desirable for large-size problems. We have conducted extensive experiments on both synthetic and real datasets. Results show that our proposed algorithm is much more efficient and effective than its convex relaxation. To this end, we further develop an efficient algorithm for computing the subproblems with a linearithmic time complexity. In addition, we extend the technique to perform the optimization-based hierarchical testing of pairwise interactions for binary classification problems, which is essentially the proximal operator associated with weak hierarchical Lasso. 
Simulation studies show that the non-convex hierarchical testing framework outperforms the convex relaxation when a hierarchical structure exists between main effects and interactions.", acknowledgement = ack-nhfb, articleno = "32", fjournal = "ACM Transactions on Knowledge Discovery from Data (TKDD)", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J1054", } @Article{Wang:2016:ISI, author = "Wei Wang and Jure Leskovec", title = "Introduction to the Special Issue of Best Papers in {ACM SIGKDD 2014}", journal = j-TKDD, volume = "10", number = "4", pages = "33:1--33:??", month = jul, year = "2016", CODEN = "????", DOI = "http://dx.doi.org/10.1145/2936718", ISSN = "1556-4681 (print), 1556-472X (electronic)", ISSN-L = "1556-4681", bibdate = "Mon Aug 29 07:28:29 MDT 2016", bibsource = "http://www.acm.org/pubs/contents/journals/tkdd/; http://www.math.utah.edu/pub/tex/bib/tkdd.bib", acknowledgement = ack-nhfb, articleno = "33", fjournal = "ACM Transactions on Knowledge Discovery from Data (TKDD)", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J1054", } @Article{Xu:2016:PSP, author = "Silei Xu and John C. S. Lui", title = "Product Selection Problem: Improve Market Share by Learning Consumer Behavior", journal = j-TKDD, volume = "10", number = "4", pages = "34:1--34:??", month = jul, year = "2016", CODEN = "????", DOI = "http://dx.doi.org/10.1145/2753764", ISSN = "1556-4681 (print), 1556-472X (electronic)", ISSN-L = "1556-4681", bibdate = "Mon Aug 29 07:28:29 MDT 2016", bibsource = "http://www.acm.org/pubs/contents/journals/tkdd/; http://www.math.utah.edu/pub/tex/bib/tkdd.bib", abstract = "It is often crucial for manufacturers to decide what products to produce so that they can increase their market share in an increasingly fierce market. To decide which products to produce, manufacturers need to analyze the consumers' requirements and how consumers make their purchase decisions so that the new products will be competitive in the market. 
In this paper, we first present a general distance-based product adoption model to capture consumers' purchase behavior. Using this model, various distance metrics can be used to describe different real life purchase behavior. We then provide a learning algorithm to decide which set of distance metrics one should use when we are given some accessible historical purchase data. Based on the product adoption model, we formalize the k most marketable products (or k-MMP) selection problem and formally prove that the problem is NP-hard. To tackle this problem, we propose an efficient greedy-based approximation algorithm with a provable solution guarantee. Using submodularity analysis, we prove that our approximation algorithm can achieve at least 63\% of the optimal solution. We apply our algorithm on both synthetic datasets and real-world datasets (TripAdvisor.com), and show that our algorithm can easily achieve five or more orders of speedup over the exhaustive search and achieve about 96\% of the optimal solution on average. 
Our experiments also demonstrate the robustness of our distance metric learning method, and illustrate how one can adopt it to improve the accuracy of product selection.", acknowledgement = ack-nhfb, articleno = "34", fjournal = "ACM Transactions on Knowledge Discovery from Data (TKDD)", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J1054", } @Article{Jiang:2016:CSB, author = "Meng Jiang and Peng Cui and Alex Beutel and Christos Faloutsos and Shiqiang Yang", title = "Catching Synchronized Behaviors in Large Networks: a Graph Mining Approach", journal = j-TKDD, volume = "10", number = "4", pages = "35:1--35:??", month = jul, year = "2016", CODEN = "????", DOI = "http://dx.doi.org/10.1145/2746403", ISSN = "1556-4681 (print), 1556-472X (electronic)", ISSN-L = "1556-4681", bibdate = "Mon Aug 29 07:28:29 MDT 2016", bibsource = "http://www.acm.org/pubs/contents/journals/tkdd/; http://www.math.utah.edu/pub/tex/bib/tkdd.bib", abstract = "Given a directed graph of millions of nodes, how can we automatically spot anomalous, suspicious nodes judging only from their connectivity patterns? Suspicious graph patterns show up in many applications, from Twitter users who buy fake followers, manipulating the social network, to botnet members performing distributed denial of service attacks, disturbing the network traffic graph. We propose a fast and effective method, CatchSync, which exploits two of the tell-tale signs left in graphs by fraudsters: (a) synchronized behavior: suspicious nodes have extremely similar behavior patterns because they are often required to perform some task together (such as follow the same user); and (b) rare behavior: their connectivity patterns are very different from the majority. We introduce novel measures to quantify both concepts (``synchronicity'' and ``normality'') and we propose a parameter-free algorithm that works on the resulting synchronicity-normality plots. 
Thanks to careful design, CatchSync has the following desirable properties: (a) it is scalable to large datasets, being linear in the graph size; (b) it is parameter free; and (c) it is side-information-oblivious: it can operate using only the topology, without needing labeled data, nor timing information, and the like, while still capable of using side information if available. We applied CatchSync on three large, real datasets, 1-billion-edge Twitter social graph, 3-billion-edge, and 12-billion-edge Tencent Weibo social graphs, and several synthetic ones; CatchSync consistently outperforms existing competitors, both in detection accuracy by 36\% on Twitter and 20\% on Tencent Weibo, as well as in speed.", acknowledgement = ack-nhfb, articleno = "35", fjournal = "ACM Transactions on Knowledge Discovery from Data (TKDD)", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J1054", } @Article{Wei:2016:HTH, author = "Ying Wei and Yangqiu Song and Yi Zhen and Bo Liu and Qiang Yang", title = "Heterogeneous Translated Hashing: a Scalable Solution Towards Multi-Modal Similarity Search", journal = j-TKDD, volume = "10", number = "4", pages = "36:1--36:??", month = jul, year = "2016", CODEN = "????", DOI = "http://dx.doi.org/10.1145/2744204", ISSN = "1556-4681 (print), 1556-472X (electronic)", ISSN-L = "1556-4681", bibdate = "Mon Aug 29 07:28:29 MDT 2016", bibsource = "http://www.acm.org/pubs/contents/journals/tkdd/; http://www.math.utah.edu/pub/tex/bib/hash.bib; http://www.math.utah.edu/pub/tex/bib/tkdd.bib", abstract = "Multi-modal similarity search has attracted considerable attention to meet the need of information retrieval across different types of media. To enable efficient multi-modal similarity search in large-scale databases recently, researchers start to study multi-modal hashing. Most of the existing methods are applied to search across multi-views among which explicit correspondence is provided. 
Given a multi-modal similarity search task, we observe that abundant multi-view data can be found on the Web which can serve as an auxiliary bridge. In this paper, we propose a Heterogeneous Translated Hashing (HTH) method with such auxiliary bridge incorporated not only to improve current multi-view search but also to enable similarity search across heterogeneous media which have no direct correspondence. HTH provides more flexible and discriminative ability by embedding heterogeneous media into different Hamming spaces, compared to almost all existing methods that map heterogeneous data in a common Hamming space. We formulate a joint optimization model to learn hash functions embedding heterogeneous media into different Hamming spaces, and a translator aligning different Hamming spaces. The extensive experiments on two real-world datasets, one publicly available dataset of Flickr, and the other MIRFLICKR-Yahoo Answers dataset, highlight the effectiveness and efficiency of our algorithm.", acknowledgement = ack-nhfb, articleno = "36", fjournal = "ACM Transactions on Knowledge Discovery from Data (TKDD)", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J1054", } @Article{Tong:2016:GES, author = "Hanghang Tong and Fei Wang and Munmun De Choudhury and Zoran Obradovic", title = "Guest Editorial: Special Issue on Connected Health at Big Data Era {(BigChat)}: a {TKDD} Special Issue", journal = j-TKDD, volume = "10", number = "4", pages = "37:1--37:??", month = jul, year = "2016", CODEN = "????", DOI = "http://dx.doi.org/10.1145/2912122", ISSN = "1556-4681 (print), 1556-472X (electronic)", ISSN-L = "1556-4681", bibdate = "Mon Aug 29 07:28:29 MDT 2016", bibsource = "http://www.acm.org/pubs/contents/journals/tkdd/; http://www.math.utah.edu/pub/tex/bib/tkdd.bib", acknowledgement = ack-nhfb, articleno = "37", fjournal = "ACM Transactions on Knowledge Discovery from Data (TKDD)", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J1054", } @Article{Xiong:2016:KIT, 
author = "Feiyu Xiong and Moshe Kam and Leonid Hrebien and Beilun Wang and Yanjun Qi", title = "Kernelized Information-Theoretic Metric Learning for Cancer Diagnosis Using High-Dimensional Molecular Profiling Data", journal = j-TKDD, volume = "10", number = "4", pages = "38:1--38:??", month = jul, year = "2016", CODEN = "????", DOI = "http://dx.doi.org/10.1145/2789212", ISSN = "1556-4681 (print), 1556-472X (electronic)", ISSN-L = "1556-4681", bibdate = "Mon Aug 29 07:28:29 MDT 2016", bibsource = "http://www.acm.org/pubs/contents/journals/tkdd/; http://www.math.utah.edu/pub/tex/bib/tkdd.bib", abstract = "With the advancement of genome-wide monitoring technologies, molecular expression data have become widely used for diagnosing cancer through tumor or blood samples. When mining molecular signature data, the process of comparing samples through an adaptive distance function is fundamental but difficult, as such datasets are normally heterogeneous and high dimensional. In this article, we present kernelized information-theoretic metric learning (KITML) algorithms that optimize a distance function to tackle the cancer diagnosis problem and scale to high dimensionality. By learning a nonlinear transformation in the input space implicitly through kernelization, KITML permits efficient optimization, low storage, and improved learning of distance metric. We propose two novel applications of KITML for diagnosing cancer using high-dimensional molecular profiling data: (1) for sample-level cancer diagnosis, the learned metric is used to improve the performance of k -nearest neighbor classification; and (2) for estimating the severity level or stage of a group of samples, we propose a novel set-based ranking approach to extend KITML. For the sample-level cancer classification task, we have evaluated on 14 cancer gene microarray datasets and compared with eight other state-of-the-art approaches. 
The results show that our approach achieves the best overall performance for the task of molecular-expression-driven cancer sample diagnosis. For the group-level cancer stage estimation, we test the proposed set-KITML approach using three multi-stage cancer microarray datasets, and correctly estimated the stages of sample groups for all three studies.", acknowledgement = ack-nhfb, articleno = "38", fjournal = "ACM Transactions on Knowledge Discovery from Data (TKDD)", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J1054", } @Article{Yang:2016:JML, author = "Pei Yang and Hongxia Yang and Haoda Fu and Dawei Zhou and Jieping Ye and Theodoros Lappas and Jingrui He", title = "Jointly Modeling Label and Feature Heterogeneity in Medical Informatics", journal = j-TKDD, volume = "10", number = "4", pages = "39:1--39:??", month = jul, year = "2016", CODEN = "????", DOI = "http://dx.doi.org/10.1145/2768831", ISSN = "1556-4681 (print), 1556-472X (electronic)", ISSN-L = "1556-4681", bibdate = "Mon Aug 29 07:28:29 MDT 2016", bibsource = "http://www.acm.org/pubs/contents/journals/tkdd/; http://www.math.utah.edu/pub/tex/bib/tkdd.bib", abstract = "Multiple types of heterogeneity including label heterogeneity and feature heterogeneity often co-exist in many real-world data mining applications, such as diabetes treatment classification, gene functionality prediction, and brain image analysis. To effectively leverage such heterogeneity, in this article, we propose a novel graph-based model for Learning with both Label and Feature heterogeneity, namely L$^2$F. It models the label correlation by requiring that any two label-specific classifiers behave similarly on the same views if the associated labels are similar, and imposes the view consistency by requiring that view-based classifiers generate similar predictions on the same examples. The objective function for L$^2$F is jointly convex. 
To solve the optimization problem, we propose an iterative algorithm, which is guaranteed to converge to the global optimum. One appealing feature of L$^2$F is that it is capable of handling data with missing views and labels. Furthermore, we analyze its generalization performance based on Rademacher complexity, which sheds light on the benefits of jointly modeling the label and feature heterogeneity. Experimental results on various biomedical datasets show the effectiveness of the proposed approach.", acknowledgement = ack-nhfb, articleno = "39", fjournal = "ACM Transactions on Knowledge Discovery from Data (TKDD)", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J1054", } @Article{WU:2016:MDN, author = "Yubao Wu and Xiaofeng Zhu and Li Li and Wei Fan and Ruoming Jin and Xiang Zhang", title = "Mining Dual Networks: Models, Algorithms, and Applications", journal = j-TKDD, volume = "10", number = "4", pages = "40:1--40:??", month = jul, year = "2016", CODEN = "????", DOI = "http://dx.doi.org/10.1145/2785970", ISSN = "1556-4681 (print), 1556-472X (electronic)", ISSN-L = "1556-4681", bibdate = "Mon Aug 29 07:28:29 MDT 2016", bibsource = "http://www.acm.org/pubs/contents/journals/tkdd/; http://www.math.utah.edu/pub/tex/bib/tkdd.bib", abstract = "Finding the densest subgraph in a single graph is a fundamental problem that has been extensively studied. In many emerging applications, there exist dual networks. For example, in genetics, it is important to use protein interactions to interpret genetic interactions. In this application, one network represents physical interactions among nodes, for example, protein--protein interactions, and another network represents conceptual interactions, for example, genetic interactions. Edges in the conceptual network are usually derived based on certain correlation measure or statistical test measuring the strength of the interaction. Two nodes with strong conceptual interaction may not have direct physical interaction. 
In this article, we propose the novel dual-network model and investigate the problem of finding the densest connected subgraph (DCS), which has the largest density in the conceptual network and is also connected in the physical network. Density in the conceptual network represents the average strength of the measured interacting signals among the set of nodes. Connectivity in the physical network shows how they interact physically. Such pattern cannot be identified using the existing algorithms for a single network. We show that even though finding the densest subgraph in a single network is polynomial time solvable, the DCS problem is NP-hard. We develop a two-step approach to solve the DCS problem. In the first step, we effectively prune the dual networks, while guarantee that the optimal solution is contained in the remaining networks. For the second step, we develop two efficient greedy methods based on different search strategies to find the DCS. Different variations of the DCS problem are also studied. 
We perform extensive experiments on a variety of real and synthetic dual networks to evaluate the effectiveness and efficiency of the developed methods.", acknowledgement = ack-nhfb, articleno = "40", fjournal = "ACM Transactions on Knowledge Discovery from Data (TKDD)", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J1054", } @Article{Cui:2016:BOQ, author = "Licong Cui and Shiqiang Tao and Guo-Qiang Zhang", title = "Biomedical Ontology Quality Assurance Using a Big Data Approach", journal = j-TKDD, volume = "10", number = "4", pages = "41:1--41:??", month = jul, year = "2016", CODEN = "????", DOI = "http://dx.doi.org/10.1145/2768830", ISSN = "1556-4681 (print), 1556-472X (electronic)", ISSN-L = "1556-4681", bibdate = "Mon Aug 29 07:28:29 MDT 2016", bibsource = "http://www.acm.org/pubs/contents/journals/tkdd/; http://www.math.utah.edu/pub/tex/bib/tkdd.bib", abstract = "This article presents recent progresses made in using scalable cloud computing environment, Hadoop and MapReduce, to perform ontology quality assurance (OQA), and points to areas of future opportunity. The standard sequential approach used for implementing OQA methods can take weeks if not months for exhaustive analyses for large biomedical ontological systems. With OQA methods newly implemented using massively parallel algorithms in the MapReduce framework, several orders of magnitude in speed-up can be achieved (e.g., from three months to three hours). Such dramatically reduced time makes it feasible not only to perform exhaustive structural analysis of large ontological hierarchies, but also to systematically track structural changes between versions for evolutional analysis. As an exemplar, progress is reported in using MapReduce to perform evolutional analysis and visualization on the Systemized Nomenclature of Medicine-Clinical Terms (SNOMED CT), a prominent clinical terminology system. 
Future opportunities in three areas are described: one is to extend the scope of MapReduce-based approach to existing OQA methods, especially for automated exhaustive structural analysis. The second is to apply our proposed MapReduce Pipeline for Lattice-based Evaluation (MaPLE) approach, demonstrated as an exemplar method for SNOMED CT, to other biomedical ontologies. The third area is to develop interfaces for reviewing results obtained by OQA methods and for visualizing ontological alignment and evolution, which can also take advantage of cloud computing technology to systematically pre-compute computationally intensive jobs in order to increase performance during user interactions with the visualization interface. Advances in these directions are expected to better support the ontological engineering lifecycle.", acknowledgement = ack-nhfb, articleno = "41", fjournal = "ACM Transactions on Knowledge Discovery from Data (TKDD)", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J1054", } @Article{Rayana:2016:LMB, author = "Shebuti Rayana and Leman Akoglu", title = "Less is More: Building Selective Anomaly Ensembles", journal = j-TKDD, volume = "10", number = "4", pages = "42:1--42:??", month = jul, year = "2016", CODEN = "????", DOI = "http://dx.doi.org/10.1145/2890508", ISSN = "1556-4681 (print), 1556-472X (electronic)", ISSN-L = "1556-4681", bibdate = "Mon Aug 29 07:28:29 MDT 2016", bibsource = "http://www.acm.org/pubs/contents/journals/tkdd/; http://www.math.utah.edu/pub/tex/bib/tkdd.bib", abstract = "Ensemble learning for anomaly detection has been barely studied, due to difficulty in acquiring ground truth and the lack of inherent objective functions. In contrast, ensemble approaches for classification and clustering have been studied and effectively used for long. 
Our work taps into this gap and builds a new ensemble approach for anomaly detection, with application to event detection in temporal graphs as well as outlier detection in no-graph settings. It handles and combines multiple heterogeneous detectors to yield improved and robust performance. Importantly, trusting results from all the constituent detectors may deteriorate the overall performance of the ensemble, as some detectors could provide inaccurate results depending on the type of data in hand and the underlying assumptions of a detector. This suggests that combining the detectors selectively is key to building effective anomaly ensembles-hence ``less is more''. In this paper we propose a novel ensemble approach called SELECT for anomaly detection, which automatically and systematically selects the results from constituent detectors to combine in a fully unsupervised fashion. We apply our method to event detection in temporal graphs and outlier detection in multi-dimensional point data (no-graph), where SELECT successfully utilizes five base detectors and seven consensus methods under a unified ensemble framework. We provide extensive quantitative evaluation of our approach for event detection on five real-world datasets (four with ground truth events), including Enron email communications, RealityMining SMS and phone call records, New York Times news corpus, and World Cup 2014 Twitter news feed. We also provide results for outlier detection on seven real-world multi-dimensional point datasets from UCI Machine Learning Repository. 
Thanks to its selection mechanism, SELECT yields superior performance compared to the individual detectors alone, the full ensemble (naively combining all results), an existing diversity-based ensemble, and an existing weighted ensemble approach.", acknowledgement = ack-nhfb, articleno = "42", fjournal = "ACM Transactions on Knowledge Discovery from Data (TKDD)", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J1054", } @Article{Zhu:2016:CCS, author = "Yada Zhu and Jingrui He", title = "Co-Clustering Structural Temporal Data with Applications to Semiconductor Manufacturing", journal = j-TKDD, volume = "10", number = "4", pages = "43:1--43:??", month = jul, year = "2016", CODEN = "????", DOI = "http://dx.doi.org/10.1145/2875427", ISSN = "1556-4681 (print), 1556-472X (electronic)", ISSN-L = "1556-4681", bibdate = "Mon Aug 29 07:28:29 MDT 2016", bibsource = "http://www.acm.org/pubs/contents/journals/tkdd/; http://www.math.utah.edu/pub/tex/bib/tkdd.bib", abstract = "Recent years have witnessed data explosion in semiconductor manufacturing due to advances in instrumentation and storage techniques. The large amount of data associated with process variables monitored over time form a rich reservoir of information, which can be used for a variety of purposes, such as anomaly detection, quality control, and fault diagnostics. In particular, following the same recipe for a certain Integrated Circuit device, multiple tools and chambers can be deployed for the production of this device, during which multiple time series can be collected, such as temperature, impedance, gas flow, electric bias, etc. These time series naturally fit into a two-dimensional array (matrix), i.e., each element in this array corresponds to a time series for one process variable from one chamber. To leverage the rich structural information in such temporal data, in this article, we propose a novel framework named C-Struts to simultaneously cluster on the two dimensions of this array. 
In this framework, we interpret the structural information as a set of constraints on the cluster membership, introduce an auxiliary probability distribution accordingly, and design an iterative algorithm to assign each time series to a certain cluster on each dimension. Furthermore, we establish the equivalence between C-Struts and a generic optimization problem, which is able to accommodate various distance functions. Extensive experiments on synthetic, benchmark, as well as manufacturing datasets demonstrate the effectiveness of the proposed method.", acknowledgement = ack-nhfb, articleno = "43", fjournal = "ACM Transactions on Knowledge Discovery from Data (TKDD)", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J1054", } @Article{Tahani:2016:IDD, author = "Maryam Tahani and Ali M. A. Hemmatyar and Hamid R. Rabiee and Maryam Ramezani", title = "Inferring Dynamic Diffusion Networks in Online Media", journal = j-TKDD, volume = "10", number = "4", pages = "44:1--44:??", month = jul, year = "2016", CODEN = "????", DOI = "http://dx.doi.org/10.1145/2882968", ISSN = "1556-4681 (print), 1556-472X (electronic)", ISSN-L = "1556-4681", bibdate = "Mon Aug 29 07:28:29 MDT 2016", bibsource = "http://www.acm.org/pubs/contents/journals/tkdd/; http://www.math.utah.edu/pub/tex/bib/tkdd.bib", abstract = "Online media play an important role in information societies by providing a convenient infrastructure for different processes. Information diffusion that is a fundamental process taking place on social and information networks has been investigated in many studies. Research on information diffusion in these networks faces two main challenges: (1) In most cases, diffusion takes place on an underlying network, which is latent and its structure is unknown. (2) This latent network is not fixed and changes over time. In this article, we investigate the diffusion network extraction (DNE) problem when the underlying network is dynamic and latent. 
We model the diffusion behavior (existence probability) of each edge as a stochastic process and utilize the Hidden Markov Model (HMM) to discover the most probable diffusion links according to the current observation of the diffusion process, which is the infection time of nodes and the past diffusion behavior of links. We evaluate the performance of our Dynamic Diffusion Network Extraction (DDNE) method, on both synthetic and real datasets. Experimental results show that the performance of the proposed method is independent of the cascade transmission model and outperforms the state of art method in terms of F-measure.", acknowledgement = ack-nhfb, articleno = "44", fjournal = "ACM Transactions on Knowledge Discovery from Data (TKDD)", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J1054", } @Article{Koh:2016:URP, author = "Yun Sing Koh and Sri Devi Ravana", title = "Unsupervised Rare Pattern Mining: a Survey", journal = j-TKDD, volume = "10", number = "4", pages = "45:1--45:??", month = jul, year = "2016", CODEN = "????", DOI = "http://dx.doi.org/10.1145/2898359", ISSN = "1556-4681 (print), 1556-472X (electronic)", ISSN-L = "1556-4681", bibdate = "Mon Aug 29 07:28:29 MDT 2016", bibsource = "http://www.acm.org/pubs/contents/journals/tkdd/; http://www.math.utah.edu/pub/tex/bib/tkdd.bib", abstract = "Association rule mining was first introduced to examine patterns among frequent items. The original motivation for seeking these rules arose from need to examine customer purchasing behaviour in supermarket transaction data. It seeks to identify combinations of items or itemsets, whose presence in a transaction affects the likelihood of the presence of another specific item or itemsets. In recent years, there has been an increasing demand for rare association rule mining. Detecting rare patterns in data is a vital task, with numerous high-impact applications including medical, finance, and security. 
This survey aims to provide a general, comprehensive, and structured overview of the state-of-the-art methods for rare pattern mining. We investigate the problems in finding rare rules using traditional association rule mining. As rare association rule mining has not been well explored, there is still specific groundwork that needs to be established. We will discuss some of the major issues in rare association rule mining and also look at current algorithms. As a contribution, we give a general framework for categorizing algorithms: Apriori and Tree based. We highlight the differences between these methods. Finally, we present several real-world application using rare pattern mining in diverse domains. We conclude our survey with a discussion on open and practical challenges in the field.", acknowledgement = ack-nhfb, articleno = "45", fjournal = "ACM Transactions on Knowledge Discovery from Data (TKDD)", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J1054", } @Article{Cheng:2016:CFR, author = "Wei Cheng and Zhishan Guo and Xiang Zhang and Wei Wang", title = "{CGC}: a Flexible and Robust Approach to Integrating Co-Regularized Multi-Domain Graph for Clustering", journal = j-TKDD, volume = "10", number = "4", pages = "46:1--46:??", month = jul, year = "2016", CODEN = "????", DOI = "http://dx.doi.org/10.1145/2903147", ISSN = "1556-4681 (print), 1556-472X (electronic)", ISSN-L = "1556-4681", bibdate = "Mon Aug 29 07:28:29 MDT 2016", bibsource = "http://www.acm.org/pubs/contents/journals/tkdd/; http://www.math.utah.edu/pub/tex/bib/tkdd.bib", abstract = "Multi-view graph clustering aims to enhance clustering performance by integrating heterogeneous information collected in different domains. Each domain provides a different view of the data instances. Leveraging cross-domain information has been demonstrated an effective way to achieve better clustering results. 
Despite the previous success, existing multi-view graph clustering methods usually assume that different views are available for the same set of instances. Thus, instances in different domains can be treated as having strict one-to-one relationship. In many real-life applications, however, data instances in one domain may correspond to multiple instances in another domain. Moreover, relationships between instances in different domains may be associated with weights based on prior (partial) knowledge. In this article, we propose a flexible and robust framework, Co-regularized Graph Clustering (CGC), based on non-negative matrix factorization (NMF), to tackle these challenges. CGC has several advantages over the existing methods. First, it supports many-to-many cross-domain instance relationship. Second, it incorporates weight on cross-domain relationship. Third, it allows partial cross-domain mapping so that graphs in different domains may have different sizes. Finally, it provides users with the extent to which the cross-domain instance relationship violates the in-domain clustering structure, and thus enables users to re-evaluate the consistency of the relationship. We develop an efficient optimization method that guarantees to find the global optimal solution with a given confidence requirement. The proposed method can automatically identify noisy domains and assign smaller weights to them. This helps to obtain optimal graph partition for the focused domain. 
Extensive experimental results on UCI benchmark datasets, newsgroup datasets, and biological interaction networks demonstrate the effectiveness of our approach.", acknowledgement = ack-nhfb, articleno = "46", fjournal = "ACM Transactions on Knowledge Discovery from Data (TKDD)", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J1054", } @Article{Shen:2016:SPO, author = "Chih-Ya Shen and De-Nian Yang and Wang-Chien Lee and Ming-Syan Chen", title = "Spatial-Proximity Optimization for Rapid Task Group Deployment", journal = j-TKDD, volume = "10", number = "4", pages = "47:1--47:??", month = jul, year = "2016", CODEN = "????", DOI = "http://dx.doi.org/10.1145/2818714", ISSN = "1556-4681 (print), 1556-472X (electronic)", ISSN-L = "1556-4681", bibdate = "Mon Aug 29 07:28:29 MDT 2016", bibsource = "http://www.acm.org/pubs/contents/journals/tkdd/; http://www.math.utah.edu/pub/tex/bib/tkdd.bib", abstract = "Spatial proximity is one of the most important factors for the quick deployment of the task groups in various time-sensitive missions. This article proposes a new spatial query, Spatio-Social Team Query (SSTQ), that forms a strong task group by considering (1) the group's spatial distance (i.e., transportation time), (2) skills of the candidate group members, and (3) social rapport among the candidates. Efficient processing of SSTQ is very challenging, because the aforementioned spatial, skill, and social factors need to be carefully examined. In this article, therefore, we first formulate two subproblems of SSTQ, namely Hop-Constrained Team Problem (HCTP) and Connection-Oriented Team Query (COTQ). HCTP is a decision problem that considers only social and skill dimensions. We prove that HCTP is NP-Complete. Moreover, based on the hardness of HCTP, we prove that SSTQ is NP-Hard and inapproximable within any factor. On the other hand, COTQ is a special case of SSTQ that relaxes the social constraint. 
We prove that COTQ is NP-Hard and propose an approximation algorithm for COTQ, namely COTprox. Furthermore, based on the observations on COTprox, we devise an approximation algorithm, SSTprox, with a guaranteed error bound for SSTQ. Finally, to efficiently obtain the optimal solution to SSTQ for small instances, we design two efficient algorithms, SpatialFirst and SkillFirst, with different scenarios in mind. These two algorithms incorporate various effective ordering and pruning techniques to reduce the search space for answering SSTQ. Experimental results on real datasets indicate that the proposed algorithms can efficiently answer SSTQ under various parameter settings.", acknowledgement = ack-nhfb, articleno = "47", fjournal = "ACM Transactions on Knowledge Discovery from Data (TKDD)", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J1054", } @Article{Yu:2016:FDV, author = "Zhiwen Yu and Zhitao Wang and Liming Chen and Bin Guo and Wenjie Li", title = "Featuring, Detecting, and Visualizing Human Sentiment in {Chinese} Micro-Blog", journal = j-TKDD, volume = "10", number = "4", pages = "48:1--48:??", month = jul, year = "2016", CODEN = "????", DOI = "http://dx.doi.org/10.1145/2821513", ISSN = "1556-4681 (print), 1556-472X (electronic)", ISSN-L = "1556-4681", bibdate = "Mon Aug 29 07:28:29 MDT 2016", bibsource = "http://www.acm.org/pubs/contents/journals/tkdd/; http://www.math.utah.edu/pub/tex/bib/tkdd.bib", abstract = "Micro-blog has been increasingly used for the public to express their opinions, and for organizations to detect public sentiment about social events or public policies. In this article, we examine and identify the key problems of this field, focusing particularly on the characteristics of innovative words, multi-media elements, and hierarchical structure of Chinese ``Weibo.'' Based on the analysis, we propose a novel approach and develop associated theoretical and technological methods to address these problems. 
These include a new sentiment word mining method based on three wording metrics and point-wise information, a rule set model for analyzing sentiment features of different linguistic components, and the corresponding methodology for calculating sentiment on multi-granularity considering emoticon elements as auxiliary affective factors. We evaluate our new word discovery and sentiment detection methods on a real-life Chinese micro-blog dataset. Initial results show that our new diction can improve sentiment detection, and they demonstrate that our multi-level rule set method is more effective, with the average accuracy being 10.2\% and 1.5\% higher than two existing methods for Chinese micro-blog sentiment analysis. In addition, we exploit visualization techniques to study the relationships between online sentiment and real life. The visualization of detected sentiment can help depict temporal patterns and spatial discrepancy.", acknowledgement = ack-nhfb, articleno = "48", fjournal = "ACM Transactions on Knowledge Discovery from Data (TKDD)", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J1054", } @Article{Chen:2016:EOL, author = "Chen Chen and Hanghang Tong and B. Aditya Prakash and Tina Eliassi-Rad and Michalis Faloutsos and Christos Faloutsos", title = "Eigen-Optimization on Large Graphs by Edge Manipulation", journal = j-TKDD, volume = "10", number = "4", pages = "49:1--49:??", month = jul, year = "2016", CODEN = "????", DOI = "http://dx.doi.org/10.1145/2903148", ISSN = "1556-4681 (print), 1556-472X (electronic)", ISSN-L = "1556-4681", bibdate = "Mon Aug 29 07:28:29 MDT 2016", bibsource = "http://www.acm.org/pubs/contents/journals/tkdd/; http://www.math.utah.edu/pub/tex/bib/tkdd.bib", abstract = "Large graphs are prevalent in many applications and enable a variety of information dissemination processes, e.g., meme, virus, and influence propagation. 
How can we optimize the underlying graph structure to affect the outcome of such dissemination processes in a desired way (e.g., stop a virus propagation, facilitate the propagation of a piece of good idea, etc)? Existing research suggests that the leading eigenvalue of the underlying graph is the key metric in determining the so-called epidemic threshold for a variety of dissemination models. In this paper, we study the problem of how to optimally place a set of edges (e.g., edge deletion and edge addition) to optimize the leading eigenvalue of the underlying graph, so that we can guide the dissemination process in a desired way. We propose effective, scalable algorithms for edge deletion and edge addition, respectively. In addition, we reveal the intrinsic relationship between edge deletion and node deletion problems. Experimental results validate the effectiveness and efficiency of the proposed algorithms.", acknowledgement = ack-nhfb, articleno = "49", fjournal = "ACM Transactions on Knowledge Discovery from Data (TKDD)", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J1054", } @Article{Yu:2016:STR, author = "Zhiwen Yu and Miao Tian and Zhu Wang and Bin Guo and Tao Mei", title = "Shop-Type Recommendation Leveraging the Data from Social Media and Location-Based Services", journal = j-TKDD, volume = "11", number = "1", pages = "1:1--1:??", month = aug, year = "2016", CODEN = "????", DOI = "http://dx.doi.org/10.1145/2930671", ISSN = "1556-4681 (print), 1556-472X (electronic)", ISSN-L = "1556-4681", bibdate = "Mon Aug 29 07:28:30 MDT 2016", bibsource = "http://www.acm.org/pubs/contents/journals/tkdd/; http://www.math.utah.edu/pub/tex/bib/tkdd.bib", abstract = "It is an important yet challenging task for investors to determine the most suitable type of shop (e.g., restaurant, fashion) for a newly opened store. Traditional ways are predominantly field surveys and empirical estimation, which are not effective as they lack shop-related data. 
As social media and location-based services (LBS) are becoming more and more pervasive, user-generated data from these platforms are providing rich information not only about individual consumption experiences, but also about shop attributes. In this paper, we investigate the recommendation of shop types for a given location, by leveraging heterogeneous data that are mainly historical user preferences and location context from social media and LBS. Our goal is to select the most suitable shop type, seeking to maximize the number of customers served from a candidate set of types. We propose a novel bias learning matrix factorization method with feature fusion for shop popularity prediction. Features are defined and extracted from two perspectives: location, where features are closely related to location characteristics, and commercial, where features are about the relationships between shops in the neighborhood. Experimental results show that the proposed method outperforms state-of-the-art solutions.", acknowledgement = ack-nhfb, articleno = "1", fjournal = "ACM Transactions on Knowledge Discovery from Data (TKDD)", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J1054", } @Article{McDowell:2016:LNA, author = "Luke K. McDowell and David W. Aha", title = "Leveraging Neighbor Attributes for Classification in Sparsely Labeled Networks", journal = j-TKDD, volume = "11", number = "1", pages = "2:1--2:??", month = aug, year = "2016", CODEN = "????", DOI = "http://dx.doi.org/10.1145/2898358", ISSN = "1556-4681 (print), 1556-472X (electronic)", ISSN-L = "1556-4681", bibdate = "Mon Aug 29 07:28:30 MDT 2016", bibsource = "http://www.acm.org/pubs/contents/journals/tkdd/; http://www.math.utah.edu/pub/tex/bib/tkdd.bib", abstract = "Many analysis tasks involve linked nodes, such as people connected by friendship links. Research on link-based classification (LBC) has studied how to leverage these connections to improve classification accuracy. 
Most such prior research has assumed the provision of a densely labeled training network. Instead, this article studies the common and challenging case when LBC must use a single sparsely labeled network for both learning and inference, a case where existing methods often yield poor accuracy. To address this challenge, we introduce a novel method that enables prediction via ``neighbor attributes,'' which were briefly considered by early LBC work but then abandoned due to perceived problems. We then explain, using both extensive experiments and loss decomposition analysis, how using neighbor attributes often significantly improves accuracy. We further show that using appropriate semi-supervised learning (SSL) is essential to obtaining the best accuracy in this domain and that the gains of neighbor attributes remain across a range of SSL choices and data conditions. Finally, given the challenges of label sparsity for LBC and the impact of neighbor attributes, we show that multiple previous studies must be re-considered, including studies regarding the best model features, the impact of noisy attributes, and strategies for active learning.", acknowledgement = ack-nhfb, articleno = "2", fjournal = "ACM Transactions on Knowledge Discovery from Data (TKDD)", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J1054", } @Article{Chang:2016:CSP, author = "Xiaojun Chang and Feiping Nie and Yi Yang and Chengqi Zhang and Heng Huang", title = "Convex Sparse {PCA} for Unsupervised Feature Learning", journal = j-TKDD, volume = "11", number = "1", pages = "3:1--3:??", month = aug, year = "2016", CODEN = "????", DOI = "http://dx.doi.org/10.1145/2910585", ISSN = "1556-4681 (print), 1556-472X (electronic)", ISSN-L = "1556-4681", bibdate = "Mon Aug 29 07:28:30 MDT 2016", bibsource = "http://www.acm.org/pubs/contents/journals/tkdd/; http://www.math.utah.edu/pub/tex/bib/tkdd.bib", abstract = "Principal component analysis (PCA) has been widely applied to dimensionality reduction and 
data pre-processing for different applications in engineering, biology, social science, and the like. Classical PCA and its variants seek for linear projections of the original variables to obtain the low-dimensional feature representations with maximal variance. One limitation is that it is difficult to interpret the results of PCA. Besides, the classical PCA is vulnerable to certain noisy data. In this paper, we propose a Convex Sparse Principal Component Analysis (CSPCA) algorithm and apply it to feature learning. First, we show that PCA can be formulated as a low-rank regression optimization problem. Based on the discussion, the $l_{2, 1}$-norm minimization is incorporated into the objective function to make the regression coefficients sparse, thereby robust to the outliers. Also, based on the sparse model used in CSPCA, an optimal weight is assigned to each of the original feature, which in turn provides the output with good interpretability. With the output of our CSPCA, we can effectively analyze the importance of each feature under the PCA criteria. Our new objective function is convex, and we propose an iterative algorithm to optimize it. We apply the CSPCA algorithm to feature selection and conduct extensive experiments on seven benchmark datasets. 
Experimental results demonstrate that the proposed algorithm outperforms state-of-the-art unsupervised feature selection algorithms.", acknowledgement = ack-nhfb, articleno = "3", fjournal = "ACM Transactions on Knowledge Discovery from Data (TKDD)", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J1054", } @Article{Wu:2016:LLR, author = "Ou Wu and Qiang You and Fen Xia and Lei Ma and Weiming Hu", title = "Listwise Learning to Rank from Crowds", journal = j-TKDD, volume = "11", number = "1", pages = "4:1--4:??", month = aug, year = "2016", CODEN = "????", DOI = "http://dx.doi.org/10.1145/2910586", ISSN = "1556-4681 (print), 1556-472X (electronic)", ISSN-L = "1556-4681", bibdate = "Mon Aug 29 07:28:30 MDT 2016", bibsource = "http://www.acm.org/pubs/contents/journals/tkdd/; http://www.math.utah.edu/pub/tex/bib/tkdd.bib", abstract = "Learning to rank has received great attention in recent years as it plays a crucial role in many applications such as information retrieval and data mining. The existing concept of learning to rank assumes that each training instance is associated with a reliable label. However, in practice, this assumption does not necessarily hold true as it may be infeasible or remarkably expensive to obtain reliable labels for many learning to rank applications. Therefore, a feasible approach is to collect labels from crowds and then learn a ranking function from crowdsourcing labels. This study explores the listwise learning to rank with crowdsourcing labels obtained from multiple annotators, who may be unreliable. A new probabilistic ranking model is first proposed by combining two existing models. Subsequently, a ranking function is trained by proposing a maximum likelihood learning approach, which estimates ground-truth labels and annotator expertise, and trains the ranking function iteratively. 
In practical crowdsourcing machine learning, valuable side information (e.g., professional grades) about involved annotators is normally attainable. Therefore, this study also investigates learning to rank from crowd labels when side information on the expertise of involved annotators is available. In particular, three basic types of side information are investigated, and corresponding learning algorithms are consequently introduced. Further, the top-k learning to rank from crowdsourcing labels are explored to deal with long training ranking lists. The proposed algorithms are tested on both synthetic and real-world data. Results reveal that the maximum likelihood estimation approach significantly outperforms the average approach and existing crowdsourcing regression methods. The performances of the proposed algorithms are comparable to those of the learning model in consideration of reliable labels. The results of the investigation further indicate that side information is helpful in inferring both ranking functions and expertise degrees of annotators.", acknowledgement = ack-nhfb, articleno = "4", fjournal = "ACM Transactions on Knowledge Discovery from Data (TKDD)", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J1054", } @Article{Shao:2016:SCI, author = "Junming Shao and Qinli Yang and Hoang-Vu Dang and Bertil Schmidt and Stefan Kramer", title = "Scalable Clustering by Iterative Partitioning and Point Attractor Representation", journal = j-TKDD, volume = "11", number = "1", pages = "5:1--5:??", month = aug, year = "2016", CODEN = "????", DOI = "http://dx.doi.org/10.1145/2934688", ISSN = "1556-4681 (print), 1556-472X (electronic)", ISSN-L = "1556-4681", bibdate = "Mon Aug 29 07:28:30 MDT 2016", bibsource = "http://www.acm.org/pubs/contents/journals/tkdd/; http://www.math.utah.edu/pub/tex/bib/tkdd.bib", abstract = "Clustering very large datasets while preserving cluster quality remains a challenging data-mining task to date. 
In this paper, we propose an effective scalable clustering algorithm for large datasets that builds upon the concept of synchronization. Inherited from the powerful concept of synchronization, the proposed algorithm, CIPA (Clustering by Iterative Partitioning and Point Attractor Representations), is capable of handling very large datasets by iteratively partitioning them into thousands of subsets and clustering each subset separately. Using dynamic clustering by synchronization, each subset is then represented by a set of point attractors and outliers. Finally, CIPA identifies the cluster structure of the original dataset by clustering the newly generated dataset consisting of point attractors and outliers from all subsets. We demonstrate that our new scalable clustering approach has several attractive benefits: (a) CIPA faithfully captures the cluster structure of the original data by performing clustering on each separate data iteratively instead of using any sampling or statistical summarization technique. (b) It allows clustering very large datasets efficiently with high cluster quality. (c) CIPA is parallelizable and also suitable for distributed data. 
Extensive experiments demonstrate the effectiveness and efficiency of our approach.", acknowledgement = ack-nhfb, articleno = "5", fjournal = "ACM Transactions on Knowledge Discovery from Data (TKDD)", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J1054", } @Article{Grabocka:2016:LTS, author = "Josif Grabocka and Nicolas Schilling and Lars Schmidt-Thieme", title = "Latent Time-Series Motifs", journal = j-TKDD, volume = "11", number = "1", pages = "6:1--6:??", month = aug, year = "2016", CODEN = "????", DOI = "http://dx.doi.org/10.1145/2940329", ISSN = "1556-4681 (print), 1556-472X (electronic)", ISSN-L = "1556-4681", bibdate = "Mon Aug 29 07:28:30 MDT 2016", bibsource = "http://www.acm.org/pubs/contents/journals/tkdd/; http://www.math.utah.edu/pub/tex/bib/tkdd.bib", abstract = "Motifs are the most repetitive/frequent patterns of a time-series. The discovery of motifs is crucial for practitioners in order to understand and interpret the phenomena occurring in sequential data. Currently, motifs are searched among series sub-sequences, aiming at selecting the most frequently occurring ones. Search-based methods, which try out series sub-sequence as motif candidates, are currently believed to be the best methods in finding the most frequent patterns. However, this paper proposes an entirely new perspective in finding motifs. We demonstrate that searching is non-optimal since the domain of motifs is restricted, and instead we propose a principled optimization approach able to find optimal motifs. We treat the occurrence frequency as a function and time-series motifs as its parameters, therefore we learn the optimal motifs that maximize the frequency function. In contrast to searching, our method is able to discover the most repetitive patterns (hence optimal), even in cases where they do not explicitly occur as sub-sequences. 
Experiments on several real-life time-series datasets show that the motifs found by our method are highly more frequent than the ones found through searching, for exactly the same distance threshold.", acknowledgement = ack-nhfb, articleno = "6", fjournal = "ACM Transactions on Knowledge Discovery from Data (TKDD)", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J1054", } @Article{Zhang:2016:SNE, author = "Xianchao Zhang and Linlin Zong and Quanzeng You and Xing Yong", title = "Sampling for {Nystr{\"o}m} Extension-Based Spectral Clustering: Incremental Perspective and Novel Analysis", journal = j-TKDD, volume = "11", number = "1", pages = "7:1--7:??", month = aug, year = "2016", CODEN = "????", DOI = "http://dx.doi.org/10.1145/2934693", ISSN = "1556-4681 (print), 1556-472X (electronic)", ISSN-L = "1556-4681", bibdate = "Mon Aug 29 07:28:30 MDT 2016", bibsource = "http://www.acm.org/pubs/contents/journals/tkdd/; http://www.math.utah.edu/pub/tex/bib/tkdd.bib", abstract = "Sampling is the key aspect for Nystr{\"o}m extension based spectral clustering. Traditional sampling schemes select the set of landmark points on a whole and focus on how to lower the matrix approximation error. However, the matrix approximation error does not have direct impact on the clustering performance. In this article, we propose a sampling framework from an incremental perspective, i.e., the landmark points are selected one by one, and each next point to be sampled is determined by previously selected landmark points. Incremental sampling builds explicit relationships among landmark points; thus, they work together well and provide a theoretical guarantee on the clustering performance. We provide two novel analysis methods and propose two schemes for selecting-the-next-one of the framework. The first scheme is based on clusterability analysis, which provides a better guarantee on clustering performance than schemes based on matrix approximation error analysis. 
The second scheme is based on loss analysis, which provides maximized predictive ability of the landmark points on the (implicit) labels of the unsampled points. Experimental results on a wide range of benchmark datasets demonstrate the superiorities of our proposed incremental sampling schemes over existing sampling schemes.", acknowledgement = ack-nhfb, articleno = "7", fjournal = "ACM Transactions on Knowledge Discovery from Data (TKDD)", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J1054", } @Article{Qiao:2016:FST, author = "Maoying Qiao and Richard Yi Da Xu and Wei Bian and Dacheng Tao", title = "Fast Sampling for Time-Varying Determinantal Point Processes", journal = j-TKDD, volume = "11", number = "1", pages = "8:1--8:??", month = aug, year = "2016", CODEN = "????", DOI = "http://dx.doi.org/10.1145/2943785", ISSN = "1556-4681 (print), 1556-472X (electronic)", ISSN-L = "1556-4681", bibdate = "Mon Aug 29 07:28:30 MDT 2016", bibsource = "http://www.acm.org/pubs/contents/journals/tkdd/; http://www.math.utah.edu/pub/tex/bib/tkdd.bib", abstract = "Determinantal Point Processes (DPPs) are stochastic models which assign each subset of a base dataset with a probability proportional to the subset's degree of diversity. It has been shown that DPPs are particularly appropriate in data subset selection and summarization (e.g., news display, video summarizations). DPPs prefer diverse subsets while other conventional models cannot offer. However, DPPs inference algorithms have a polynomial time complexity which makes it difficult to handle large and time-varying datasets, especially when real-time processing is required. To address this limitation, we developed a fast sampling algorithm for DPPs which takes advantage of the nature of some time-varying data (e.g., news corpora updating, communication network evolving), where the data changes between time stamps are relatively small. 
The proposed algorithm is built upon the simplification of marginal density functions over successive time stamps and the sequential Monte Carlo (SMC) sampling technique. Evaluations on both a real-world news dataset and the Enron Corpus confirm the efficiency of the proposed algorithm.", acknowledgement = ack-nhfb, articleno = "8", fjournal = "ACM Transactions on Knowledge Discovery from Data (TKDD)", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J1054", } @Article{Crescenzi:2016:GIO, author = "Pierluigi Crescenzi and Gianlorenzo D'Angelo and Lorenzo Severini and Yllka Velaj", title = "Greedily Improving Our Own Closeness Centrality in a Network", journal = j-TKDD, volume = "11", number = "1", pages = "9:1--9:??", month = aug, year = "2016", CODEN = "????", DOI = "http://dx.doi.org/10.1145/2953882", ISSN = "1556-4681 (print), 1556-472X (electronic)", ISSN-L = "1556-4681", bibdate = "Mon Aug 29 07:28:30 MDT 2016", bibsource = "http://www.acm.org/pubs/contents/journals/tkdd/; http://www.math.utah.edu/pub/tex/bib/tkdd.bib", abstract = "The closeness centrality is a well-known measure of importance of a vertex within a given complex network. Having high closeness centrality can have positive impact on the vertex itself: hence, in this paper we consider the optimization problem of determining how much a vertex can increase its centrality by creating a limited amount of new edges incident to it. We will consider both the undirected and the directed graph cases. 
In both cases, we first prove that the optimization problem does not admit a polynomial-time approximation scheme (unless P = NP ), and then propose a greedy approximation algorithm (with an almost tight approximation ratio), whose performance is then tested on synthetic graphs and real-world networks.", acknowledgement = ack-nhfb, articleno = "9", fjournal = "ACM Transactions on Knowledge Discovery from Data (TKDD)", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J1054", } @Article{Li:2016:CBN, author = "Xiang Li and Charles X. Ling and Huaimin Wang", title = "The Convergence Behavior of Naive {Bayes} on Large Sparse Datasets", journal = j-TKDD, volume = "11", number = "1", pages = "10:1--10:??", month = aug, year = "2016", CODEN = "????", DOI = "http://dx.doi.org/10.1145/2948068", ISSN = "1556-4681 (print), 1556-472X (electronic)", ISSN-L = "1556-4681", bibdate = "Mon Aug 29 07:28:30 MDT 2016", bibsource = "http://www.acm.org/pubs/contents/journals/tkdd/; http://www.math.utah.edu/pub/tex/bib/tkdd.bib", abstract = "Large and sparse datasets with a lot of missing values are common in the big data era, such as user behaviors over a large number of items. Classification in such datasets is an important topic for machine learning and data mining. Practically, naive Bayes is still a popular classification algorithm for large sparse datasets, as its time and space complexity scales linearly with the size of non-missing values. However, several important questions about the behavior of naive Bayes are yet to be answered. For example, how different mechanisms of data missing, data sparsity, and the number of attributes systematically affect the learning curves and convergence? In this paper, we address several common data missing mechanisms and propose novel data generation methods based on these mechanisms. We generate large and sparse data systematically, and study the entire AUC (Area Under ROC Curve) learning curve and convergence behavior of naive Bayes. 
We not only have several important experiment observations, but also provide detailed theoretic studies. Finally, we summarize our empirical and theoretic results as an intuitive decision flowchart and a useful guideline for classifying large sparse datasets in practice.", acknowledgement = ack-nhfb, articleno = "10", fjournal = "ACM Transactions on Knowledge Discovery from Data (TKDD)", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J1054", } @Article{Fu:2016:MGD, author = "Yanjie Fu and Hui Xiong and Yong Ge and Yu Zheng and Zijun Yao and Zhi-Hua Zhou", title = "Modeling of Geographic Dependencies for Real Estate Ranking", journal = j-TKDD, volume = "11", number = "1", pages = "11:1--11:??", month = aug, year = "2016", CODEN = "????", DOI = "http://dx.doi.org/10.1145/2934692", ISSN = "1556-4681 (print), 1556-472X (electronic)", ISSN-L = "1556-4681", bibdate = "Mon Aug 29 07:28:30 MDT 2016", bibsource = "http://www.acm.org/pubs/contents/journals/tkdd/; http://www.math.utah.edu/pub/tex/bib/tkdd.bib", abstract = "It is traditionally a challenge for home buyers to understand, compare, and contrast the investment value of real estate. Although a number of appraisal methods have been developed to value real properties, the performances of these methods have been limited by traditional data sources for real estate appraisal. With the development of new ways of collecting estate-related mobile data, there is a potential to leverage geographic dependencies of real estate for enhancing real estate appraisal. Indeed, the geographic dependencies of the investment value of an estate can be from the characteristics of its own neighborhood (individual), the values of its nearby estates (peer), and the prosperity of the affiliated latent business area (zone). To this end, in this paper, we propose a geographic method, named ClusRanking, for real estate appraisal by leveraging the mutual enforcement of ranking and clustering power. 
ClusRanking is able to exploit geographic individual, peer, and zone dependencies in a probabilistic ranking model. Specifically, we first extract the geographic utility of estates from geography data, estimate the neighborhood popularity of estates by mining taxicab trajectory data, and model the influence of latent business areas. Also, we fuse these three influential factors and predict real estate investment value. Moreover, we simultaneously consider individual, peer and zone dependencies, and derive an estate-specific ranking likelihood as the objective function. Furthermore, we propose an improved method named CR-ClusRanking by incorporating checkin information as a regularization term which reduces the performance volatility of real estate ranking system. Finally, we conduct a comprehensive evaluation with the real estate-related data of Beijing, and the experimental results demonstrate the effectiveness of our proposed methods.", acknowledgement = ack-nhfb, articleno = "11", fjournal = "ACM Transactions on Knowledge Discovery from Data (TKDD)", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J1054", }