%%% -*-BibTeX-*-
%%% ====================================================================
%%% BibTeX-file{
%%%     author          = "Nelson H. F. Beebe",
%%%     version         = "1.17",
%%%     date            = "10 October 2014",
%%%     time            = "17:20:40 MDT",
%%%     filename        = "tkdd.bib",
%%%     address         = "University of Utah
%%%                        Department of Mathematics, 110 LCB
%%%                        155 S 1400 E RM 233
%%%                        Salt Lake City, UT 84112-0090
%%%                        USA",
%%%     telephone       = "+1 801 581 5254",
%%%     FAX             = "+1 801 581 4148",
%%%     URL             = "http://www.math.utah.edu/~beebe",
%%%     checksum        = "53215 8305 45949 438953",
%%%     email           = "beebe at math.utah.edu, beebe at acm.org,
%%%                        beebe at computer.org (Internet)",
%%%     codetable       = "ISO/ASCII",
%%%     keywords        = "ACM Transactions on Knowledge Discovery from
%%%                        Data (TKDD); bibliography; TKDD",
%%%     license         = "public domain",
%%%     supported       = "yes",
%%%     docstring       = "This is a COMPLETE BibTeX bibliography for
%%%                        ACM Transactions on Knowledge Discovery from
%%%                        Data (TKDD) (CODEN ????, ISSN 1556-4681
%%%                        (print), 1556-472X (electronic)), covering
%%%                        all journal issues from 2007 -- date.
%%%
%%%                        At version 1.17, the COMPLETE journal
%%%                        coverage looked like this:
%%%
%%%                             2007 (  14)    2010 (  26)    2013 (  20)
%%%                             2008 (  18)    2011 (  11)    2014 (  37)
%%%                             2009 (  25)    2012 (  26)
%%%
%%%                             Article:        177
%%%
%%%                             Total entries:  177
%%%
%%%                        The journal Web page can be found at:
%%%
%%%                            http://www.acm.org/pubs/tkdd.html
%%%
%%%                        The journal table of contents page is at:
%%%
%%%                            http://www.acm.org/tkdd/
%%%                            http://portal.acm.org/browse_dl.cfm?idx=J1054
%%%
%%%                        Qualified subscribers can retrieve the full
%%%                        text of recent articles in PDF form.
%%%
%%%                        The initial draft was extracted from the ACM
%%%                        Web pages.
%%%
%%%                        ACM copyrights explicitly permit abstracting
%%%                        with credit, so article abstracts, keywords,
%%%                        and subject classifications have been
%%%                        included in this bibliography wherever
%%%                        available.  Article reviews have been
%%%                        omitted, until their copyright status has
%%%                        been clarified.
%%%
%%%                        bibsource keys in the bibliography entries
%%%                        below indicate the entry originally came
%%%                        from the computer science bibliography
%%%                        archive, even though it has likely since
%%%                        been corrected and updated.
%%%
%%%                        URL keys in the bibliography point to
%%%                        World Wide Web locations of additional
%%%                        information about the entry.
%%%
%%%                        BibTeX citation tags are uniformly chosen
%%%                        as name:year:abbrev, where name is the
%%%                        family name of the first author or editor,
%%%                        year is a 4-digit number, and abbrev is a
%%%                        3-letter condensation of important title
%%%                        words. Citation tags were automatically
%%%                        generated by software developed for the
%%%                        BibNet Project.
%%%
%%%                        In this bibliography, entries are sorted in
%%%                        publication order, using ``bibsort -byvolume.''
%%%
%%%                        The checksum field above contains a CRC-16
%%%                        checksum as the first value, followed by the
%%%                        equivalent of the standard UNIX wc (word
%%%                        count) utility output of lines, words, and
%%%                        characters.  This is produced by Robert
%%%                        Solovay's checksum utility."
%%%     }
%%% ====================================================================

@Preamble{"\input bibnames.sty" #
    "\def \TM {${}^{\sc TM}$}"
}

%%% ====================================================================
%%% Acknowledgement abbreviations:

@String{ack-nhfb = "Nelson H. F. Beebe,
                    University of Utah,
                    Department of Mathematics, 110 LCB,
                    155 S 1400 E RM 233,
                    Salt Lake City, UT 84112-0090, USA,
                    Tel: +1 801 581 5254,
                    FAX: +1 801 581 4148,
                    e-mail: \path|beebe@math.utah.edu|,
                            \path|beebe@acm.org|,
                            \path|beebe@computer.org| (Internet),
                    URL: \path|http://www.math.utah.edu/~beebe/|"}

%%% ====================================================================
%%% Journal abbreviations:

@String{j-TKDD                  = "ACM Transactions on Knowledge
                                  Discovery from Data (TKDD)"}

%%% ====================================================================
%%% Bibliography entries:

@Article{Han:2007:I,
  author =       "Jiawei Han",
  title =        "Introduction",
  journal =      j-TKDD,
  volume =       "1",
  number =       "1",
  pages =        "1:1--1:??",
  month =        mar,
  year =         "2007",
  CODEN =        "????",
  DOI =          "http://doi.acm.org/10.1145/1217299.1217300",
  ISSN =         "1556-4681 (print), 1556-472X (electronic)",
  bibdate =      "Fri Apr 24 17:58:36 MDT 2009",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tkdd/;
                 http://www.math.utah.edu/pub/tex/bib/tkdd.bib",
  acknowledgement = ack-nhfb,
  articleno =    "1",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1054",
}

@Article{Leskovec:2007:GED,
  author =       "Jure Leskovec and Jon Kleinberg and Christos
                 Faloutsos",
  title =        "Graph evolution: {Densification} and shrinking
                 diameters",
  journal =      j-TKDD,
  volume =       "1",
  number =       "1",
  pages =        "2:1--2:??",
  month =        mar,
  year =         "2007",
  CODEN =        "????",
  DOI =          "http://doi.acm.org/10.1145/1217299.1217301",
  ISSN =         "1556-4681 (print), 1556-472X (electronic)",
  bibdate =      "Fri Apr 24 17:58:36 MDT 2009",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tkdd/;
                 http://www.math.utah.edu/pub/tex/bib/tkdd.bib",
  abstract =     "How do real graphs evolve over time? What are normal
                 growth patterns in social, technological, and
                 information networks? Many studies have discovered
                 patterns in {\em static graphs}, identifying properties
                 in a single snapshot of a large network or in a very
                 small number of snapshots; these include heavy tails
                 for in- and out-degree distributions, communities,
                 small-world phenomena, and others. However, given the
                 lack of information about network evolution over long
                 periods, it has been hard to convert these findings
                 into statements about trends over time.\par

                 Here we study a wide range of real graphs, and we
                 observe some surprising phenomena. First, most of these
                  graphs densify over time, with the number of edges
                 growing superlinearly in the number of nodes. Second,
                 the average distance between nodes often shrinks over
                 time in contrast to the conventional wisdom that such
                 distance parameters should increase slowly as a
                 function of the number of nodes (like $O(\log n)$ or
                 $O(\log(\log n))$).\par

                 Existing graph generation models do not exhibit these
                 types of behavior even at a qualitative level. We
                 provide a new graph generator, based on a forest fire
                 spreading process that has a simple, intuitive
                 justification, requires very few parameters (like the
                 flammability of nodes), and produces graphs exhibiting
                 the full range of properties observed both in prior
                 work and in the present study.\par

                 We also notice that the forest fire model exhibits a
                 sharp transition between sparse graphs and graphs that
                 are densifying. Graphs with decreasing distance between
                 the nodes are generated around this transition
                 point.\par

                 Last, we analyze the connection between the temporal
                 evolution of the degree distribution and densification
                 of a graph. We find that the two are fundamentally
                 related. We also observe that real networks exhibit
                 this type of relation between densification and the
                 degree distribution.",
  acknowledgement = ack-nhfb,
  articleno =    "2",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1054",
  keywords =     "Densification power laws; graph generators; graph
                 mining; heavy-tailed distributions; small-world
                 phenomena",
}

@Article{Machanavajjhala:2007:DPB,
  author =       "Ashwin Machanavajjhala and Daniel Kifer and Johannes
                 Gehrke and Muthuramakrishnan Venkitasubramaniam",
  title =        "{$\ell$}-diversity: {Privacy} beyond $k$-anonymity",
  journal =      j-TKDD,
  volume =       "1",
  number =       "1",
  pages =        "3:1--3:??",
  month =        mar,
  year =         "2007",
  CODEN =        "????",
  DOI =          "http://doi.acm.org/10.1145/1217299.1217302",
  ISSN =         "1556-4681 (print), 1556-472X (electronic)",
  bibdate =      "Fri Apr 24 17:58:36 MDT 2009",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tkdd/;
                 http://www.math.utah.edu/pub/tex/bib/tkdd.bib",
  abstract =     "Publishing data about individuals without revealing
                 sensitive information about them is an important
                 problem. In recent years, a new definition of privacy
                 called $k$-anonymity has gained popularity. In a
                 $k$-anonymized dataset, each record is
                 indistinguishable from at least $k - 1$ other records
                 with respect to certain identifying attributes.\par

                 In this article, we show using two simple attacks that
                 a $k$-anonymized dataset has some subtle but severe
                 privacy problems. First, an attacker can discover the
                 values of sensitive attributes when there is little
                 diversity in those sensitive attributes. This is a
                 known problem. Second, attackers often have background
                 knowledge, and we show that $k$-anonymity does not
                 guarantee privacy against attackers using background
                 knowledge. We give a detailed analysis of these two
                 attacks, and we propose a novel and powerful privacy
                 criterion called $\ell$-diversity that can defend
                 against such attacks. In addition to building a formal
                 foundation for $\ell$-diversity, we show in an
                 experimental evaluation that $\ell$-diversity is
                 practical and can be implemented efficiently.",
  acknowledgement = ack-nhfb,
  articleno =    "3",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1054",
  keywords =     "$\ell$-diversity; Data privacy; $k$-anonymity;
                  privacy-preserving data publishing",
}

@Article{Gionis:2007:CA,
  author =       "Aristides Gionis and Heikki Mannila and Panayiotis
                 Tsaparas",
  title =        "Clustering aggregation",
  journal =      j-TKDD,
  volume =       "1",
  number =       "1",
  pages =        "4:1--4:??",
  month =        mar,
  year =         "2007",
  CODEN =        "????",
  DOI =          "http://doi.acm.org/10.1145/1217299.1217303",
  ISSN =         "1556-4681 (print), 1556-472X (electronic)",
  bibdate =      "Fri Apr 24 17:58:36 MDT 2009",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tkdd/;
                 http://www.math.utah.edu/pub/tex/bib/tkdd.bib",
  abstract =     "We consider the following problem: given a set of
                 clusterings, find a single clustering that agrees as
                 much as possible with the input clusterings. This
                 problem, {\em clustering aggregation}, appears
                 naturally in various contexts. For example, clustering
                 categorical data is an instance of the clustering
                 aggregation problem; each categorical attribute can be
                 viewed as a clustering of the input rows where rows are
                 grouped together if they take the same value on that
                 attribute. Clustering aggregation can also be used as a
                 metaclustering method to improve the robustness of
                 clustering by combining the output of multiple
                 algorithms. Furthermore, the problem formulation does
                 not require a priori information about the number of
                 clusters; it is naturally determined by the
                 optimization function.\par

                 In this article, we give a formal statement of the
                 clustering aggregation problem, and we propose a number
                 of algorithms. Our algorithms make use of the
                 connection between clustering aggregation and the
                 problem of {\em correlation clustering}. Although the
                 problems we consider are NP-hard, for several of our
                 methods, we provide theoretical guarantees on the
                 quality of the solutions. Our work provides the best
                 deterministic approximation algorithm for the variation
                 of the correlation clustering problem we consider. We
                 also show how sampling can be used to scale the
                 algorithms for large datasets. We give an extensive
                 empirical evaluation demonstrating the usefulness of
                 the problem and of the solutions.",
  acknowledgement = ack-nhfb,
  articleno =    "4",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1054",
  keywords =     "clustering aggregation; clustering categorical data;
                 correlation clustering; Data clustering",
}

@Article{Bhattacharya:2007:CER,
  author =       "Indrajit Bhattacharya and Lise Getoor",
  title =        "Collective entity resolution in relational data",
  journal =      j-TKDD,
  volume =       "1",
  number =       "1",
  pages =        "5:1--5:??",
  month =        mar,
  year =         "2007",
  CODEN =        "????",
  DOI =          "http://doi.acm.org/10.1145/1217299.1217304",
  ISSN =         "1556-4681 (print), 1556-472X (electronic)",
  bibdate =      "Fri Apr 24 17:58:36 MDT 2009",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tkdd/;
                 http://www.math.utah.edu/pub/tex/bib/tkdd.bib",
  abstract =     "Many databases contain uncertain and imprecise
                 references to real-world entities. The absence of
                 identifiers for the underlying entities often results
                 in a database which contains multiple references to the
                 same entity. This can lead not only to data redundancy,
                 but also inaccuracies in query processing and knowledge
                 extraction. These problems can be alleviated through
                 the use of {\em entity resolution}. Entity resolution
                 involves discovering the underlying entities and
                 mapping each database reference to these entities.
                 Traditionally, entities are resolved using pairwise
                 similarity over the attributes of references. However,
                 there is often additional relational information in the
                 data. Specifically, references to different entities
                 may cooccur. In these cases, collective entity
                 resolution, in which entities for cooccurring
                 references are determined jointly rather than
                 independently, can improve entity resolution accuracy.
                 We propose a novel relational clustering algorithm that
                 uses both attribute and relational information for
                 determining the underlying domain entities, and we give
                 an efficient implementation. We investigate the impact
                 that different relational similarity measures have on
                 entity resolution quality. We evaluate our collective
                 entity resolution algorithm on multiple real-world
                 databases. We show that it improves entity resolution
                 performance over both attribute-based baselines and
                 over algorithms that consider relational information
                 but do not resolve entities collectively. In addition,
                 we perform detailed experiments on synthetically
                 generated data to identify data characteristics that
                 favor collective relational resolution over purely
                 attribute-based algorithms.",
  acknowledgement = ack-nhfb,
  articleno =    "5",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1054",
  keywords =     "data cleaning; Entity resolution; graph clustering;
                 record linkage",
}

@Article{Loh:2007:EEL,
  author =       "Wei-Yin Loh and Chien-Wei Chen and Wei Zheng",
  title =        "Extrapolation errors in linear model trees",
  journal =      j-TKDD,
  volume =       "1",
  number =       "2",
  pages =        "6:1--6:??",
  month =        aug,
  year =         "2007",
  CODEN =        "????",
  DOI =          "http://doi.acm.org/10.1145/1267066.1267067",
  ISSN =         "1556-4681 (print), 1556-472X (electronic)",
  bibdate =      "Fri Apr 24 17:58:48 MDT 2009",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tkdd/;
                 http://www.math.utah.edu/pub/tex/bib/tkdd.bib",
  abstract =     "Prediction errors from a linear model tend to be
                 larger when extrapolation is involved, particularly
                 when the model is wrong. This article considers the
                 problem of extrapolation and interpolation errors when
                 a linear model tree is used for prediction. It proposes
                 several ways to curtail the size of the errors, and
                 uses a large collection of real datasets to demonstrate
                 that the solutions are effective in reducing the
                 average mean squared prediction error. The article also
                 provides a proof that, if a linear model is correct,
                 the proposed solutions have no undesirable effects as
                 the training sample size tends to infinity.",
  acknowledgement = ack-nhfb,
  articleno =    "6",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1054",
  keywords =     "Decision tree; prediction; regression; statistics",
}

@Article{Zhang:2007:MPP,
  author =       "Minghua Zhang and Ben Kao and David W. Cheung and
                 Kevin Y. Yip",
  title =        "Mining periodic patterns with gap requirement from
                 sequences",
  journal =      j-TKDD,
  volume =       "1",
  number =       "2",
  pages =        "7:1--7:??",
  month =        aug,
  year =         "2007",
  CODEN =        "????",
  DOI =          "http://doi.acm.org/10.1145/1267066.1267068",
  ISSN =         "1556-4681 (print), 1556-472X (electronic)",
  bibdate =      "Fri Apr 24 17:58:48 MDT 2009",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tkdd/;
                 http://www.math.utah.edu/pub/tex/bib/tkdd.bib",
  abstract =     "We study a problem of mining frequently occurring
                 periodic patterns with a gap requirement from
                 sequences. Given a character sequence $S$ of length $L$
                 and a pattern $P$ of length $l$, we consider $P$ a
                 frequently occurring pattern in $S$ if the probability
                 of {\em observing\/} $P$ given a randomly picked
                 length-$l$ subsequence of $S$ exceeds a certain
                 threshold. In many applications, particularly those
                 related to bioinformatics, interesting patterns are
                 {\em periodic\/} with a {\em gap requirement}. That is
                 to say, the characters in $P$ should match subsequences
                 of $S$ in such a way that the matching characters in
                 $S$ are separated by gaps of more or less the same
                 size. We show the complexity of the mining problem and
                 discuss why traditional mining algorithms are
                 computationally infeasible. We propose practical
                 algorithms for solving the problem and study their
                 characteristics. We also present a case study in which
                 we apply our algorithms on some DNA sequences. We
                 discuss some interesting patterns obtained from the
                 case study.",
  acknowledgement = ack-nhfb,
  articleno =    "7",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1054",
  keywords =     "gap requirement; periodic pattern; Sequence mining",
}

@Article{Huang:2007:TTE,
  author =       "Jen-Wei Huang and Bi-Ru Dai and Ming-Syan Chen",
  title =        "{Twain}: {Two-end} association miner with precise
                 frequent exhibition periods",
  journal =      j-TKDD,
  volume =       "1",
  number =       "2",
  pages =        "8:1--8:??",
  month =        aug,
  year =         "2007",
  CODEN =        "????",
  DOI =          "http://doi.acm.org/10.1145/1267066.1267069",
  ISSN =         "1556-4681 (print), 1556-472X (electronic)",
  bibdate =      "Fri Apr 24 17:58:48 MDT 2009",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tkdd/;
                 http://www.math.utah.edu/pub/tex/bib/tkdd.bib",
  abstract =     "We investigate the general model of mining
                 associations in a temporal database, where the
                 exhibition periods of items are allowed to be different
                  from one another. The database is divided into
                 partitions according to the time granularity imposed.
                 Such temporal association rules allow us to observe
                 short-term but interesting patterns that are absent
                 when the whole range of the database is evaluated
                 altogether. Prior work may omit some temporal
                 association rules and thus have limited practicability.
                 To remedy this and to give more precise frequent
                 exhibition periods of frequent temporal itemsets, we
                 devise an efficient algorithm {\em Twain\/} (standing
                  for {\em TWo end AssocIation miNer\/}). {\em Twain\/}
                 not only generates frequent patterns with more precise
                 frequent exhibition periods, but also discovers more
                 interesting frequent patterns. {\em Twain\/} employs
                 Start time and End time of each item to provide precise
                 frequent exhibition period while progressively handling
                 itemsets from one partition to another. Along with one
                 scan of the database, {\em Twain\/} can generate
                 frequent 2-itemsets directly according to the
                 cumulative filtering threshold. Then, {\em Twain\/}
                 adopts the scan reduction technique to generate all
                  frequent $k$-itemsets ($k > 2$) from the generated
                 frequent 2-itemsets. Theoretical properties of {\em
                 Twain\/} are derived as well in this article. The
                 experimental results show that {\em Twain\/}
                 outperforms the prior works in the quality of frequent
                 patterns, execution time, I/O cost, CPU overhead and
                 scalability.",
  acknowledgement = ack-nhfb,
  articleno =    "8",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1054",
  keywords =     "Association; temporal",
}

@Article{Bayardo:2007:ISI,
  author =       "Roberto Bayardo and Kristin P. Bennett and Gautam Das
                  and Dimitrios Gunopulos and Johannes Gehrke",
  title =        "Introduction to special issue {ACM SIGKDD 2006}",
  journal =      j-TKDD,
  volume =       "1",
  number =       "3",
  pages =        "9:1--9:??",
  month =        dec,
  year =         "2007",
  CODEN =        "????",
  DOI =          "http://doi.acm.org/10.1145/1297332.1297333",
  ISSN =         "1556-4681 (print), 1556-472X (electronic)",
  bibdate =      "Fri Apr 24 17:58:56 MDT 2009",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tkdd/;
                 http://www.math.utah.edu/pub/tex/bib/tkdd.bib",
  acknowledgement = ack-nhfb,
  articleno =    "9",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1054",
}

@Article{Bohm:2007:RPF,
  author =       "Christian B{\"o}hm and Christos Faloutsos and Jia-Yu
                 Pan and Claudia Plant",
  title =        "{RIC}: {Parameter-free} noise-robust clustering",
  journal =      j-TKDD,
  volume =       "1",
  number =       "3",
  pages =        "10:1--10:??",
  month =        dec,
  year =         "2007",
  CODEN =        "????",
  DOI =          "http://doi.acm.org/10.1145/1297332.1297334",
  ISSN =         "1556-4681 (print), 1556-472X (electronic)",
  bibdate =      "Fri Apr 24 17:58:56 MDT 2009",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tkdd/;
                 http://www.math.utah.edu/pub/tex/bib/tkdd.bib",
  abstract =     "How do we find a {\em natural\/} clustering of a
                 real-world point set which contains an unknown number
                 of clusters with different shapes, and which may be
                 contaminated by noise? As most clustering algorithms
                 were designed with certain assumptions (Gaussianity),
                 they often require the user to give input parameters,
                 and are sensitive to noise. In this article, we propose
                 a robust framework for determining a natural clustering
                 of a given dataset, based on the minimum description
                 length (MDL) principle. The proposed framework, {\em
                 robust information-theoretic clustering (RIC)}, is
                 orthogonal to any known clustering algorithm: Given a
                 preliminary clustering, RIC purifies these clusters
                 from noise, and adjusts the clusterings such that it
                 simultaneously determines the most natural amount and
                 shape (subspace) of the clusters. Our RIC method can be
                 combined with any clustering technique ranging from
                 K-means and K-medoids to advanced methods such as
                 spectral clustering. In fact, RIC is even able to
                 purify and improve an initial coarse clustering, even
                 if we start with very simple methods. In an extension,
                 we propose a fully automatic stand-alone clustering
                 method and efficiency improvements. RIC scales well
                 with the dataset size. Extensive experiments on
                 synthetic and real-world datasets validate the proposed
                 RIC framework.",
  acknowledgement = ack-nhfb,
  articleno =    "10",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1054",
  keywords =     "Clustering; data summarization; noise robustness;
                 parameter-free data mining",
}

@Article{Mei:2007:SAF,
  author =       "Qiaozhu Mei and Dong Xin and Hong Cheng and Jiawei Han
                 and Chengxiang Zhai",
  title =        "Semantic annotation of frequent patterns",
  journal =      j-TKDD,
  volume =       "1",
  number =       "3",
  pages =        "11:1--11:??",
  month =        dec,
  year =         "2007",
  CODEN =        "????",
  DOI =          "http://doi.acm.org/10.1145/1297332.1297335",
  ISSN =         "1556-4681 (print), 1556-472X (electronic)",
  bibdate =      "Fri Apr 24 17:58:56 MDT 2009",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tkdd/;
                 http://www.math.utah.edu/pub/tex/bib/tkdd.bib",
  abstract =     "Using frequent patterns to analyze data has been one
                 of the fundamental approaches in many data mining
                 applications. Research in frequent pattern mining has
                 so far mostly focused on developing efficient
                 algorithms to discover various kinds of frequent
                 patterns, but little attention has been paid to the
                 important next step --- interpreting the discovered
                 frequent patterns. Although the compression and
                 summarization of frequent patterns has been studied in
                 some recent work, the proposed techniques there can
                 only annotate a frequent pattern with nonsemantical
                 information (e.g., support), which provides only
                 limited help for a user to understand the
                 patterns.\par

                 In this article, we study the novel problem of
                 generating semantic annotations for frequent patterns.
                 The goal is to discover the hidden meanings of a
                 frequent pattern by annotating it with in-depth,
                 concise, and structured information. We propose a
                 general approach to generate such an annotation for a
                 frequent pattern by constructing its context model,
                 selecting informative context indicators, and
                 extracting representative transactions and semantically
                 similar patterns. This general approach can well
                 incorporate the user's prior knowledge, and has
                 potentially many applications, such as generating a
                 dictionary-like description for a pattern, finding
                 synonym patterns, discovering semantic relations, and
                 summarizing semantic classes of a set of frequent
                 patterns. Experiments on different datasets show that
                 our approach is effective in generating semantic
                 pattern annotations.",
  acknowledgement = ack-nhfb,
  articleno =    "11",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1054",
  keywords =     "Frequent pattern; pattern annotation; pattern context;
                 pattern semantic analysis",
}

@Article{Koren:2007:MEP,
  author =       "Yehuda Koren and Stephen C. North and Chris Volinsky",
  title =        "Measuring and extracting proximity graphs in
                 networks",
  journal =      j-TKDD,
  volume =       "1",
  number =       "3",
  pages =        "12:1--12:??",
  month =        dec,
  year =         "2007",
  CODEN =        "????",
  DOI =          "http://doi.acm.org/10.1145/1297332.1297336",
  ISSN =         "1556-4681 (print), 1556-472X (electronic)",
  bibdate =      "Fri Apr 24 17:58:56 MDT 2009",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tkdd/;
                 http://www.math.utah.edu/pub/tex/bib/tkdd.bib",
  abstract =     "Measuring distance or some other form of proximity
                 between objects is a standard data mining tool.
                 Connection subgraphs were recently proposed as a way to
                 demonstrate proximity between nodes in networks. We
                 propose a new way of measuring and extracting proximity
                 in networks called ``cycle-free effective conductance''
                 (CFEC). Importantly, the measured proximity is
                 accompanied with a {\em proximity subgraph\/} which
                 allows assessing and understanding measured values. Our
                  proximity calculation can handle more than two
                  endpoints and directed edges, is statistically well
                  behaved, and produces an effectiveness score for the
                 computed subgraphs. We provide an efficient algorithm
                 to measure and extract proximity. Also, we report
                 experimental results and show examples for four large
                 network datasets: a telecommunications calling graph,
                 the IMDB actors graph, an academic coauthorship
                 network, and a movie recommendation system.",
  acknowledgement = ack-nhfb,
  articleno =    "12",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1054",
  keywords =     "Connection subgraph; cycle-free escape probability;
                 escape probability; graph mining; proximity; proximity
                 subgraph; random walk",
}

@Article{Ihler:2007:LDE,
  author =       "Alexander Ihler and Jon Hutchins and Padhraic Smyth",
  title =        "Learning to detect events with {Markov}-modulated
                 {Poisson} processes",
  journal =      j-TKDD,
  volume =       "1",
  number =       "3",
  pages =        "13:1--13:??",
  month =        dec,
  year =         "2007",
  CODEN =        "????",
  DOI =          "http://doi.acm.org/10.1145/1297332.1297337",
  ISSN =         "1556-4681 (print), 1556-472X (electronic)",
  bibdate =      "Fri Apr 24 17:58:56 MDT 2009",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tkdd/;
                 http://www.math.utah.edu/pub/tex/bib/tkdd.bib",
  abstract =     "Time-series of count data occur in many different
                 contexts, including Internet navigation logs, freeway
                 traffic monitoring, and security logs associated with
                 buildings. In this article we describe a framework for
                 detecting anomalous events in such data using an
                 unsupervised learning approach. Normal periodic
                 behavior is modeled via a time-varying Poisson process
                 model, which in turn is modulated by a hidden Markov
                 process that accounts for bursty events. We outline a
                 Bayesian framework for learning the parameters of this
                 model from count time-series. Two large real-world
                 datasets of time-series counts are used as testbeds to
                 validate the approach, consisting of freeway traffic
                 data and logs of people entering and exiting a
                 building. We show that the proposed model is
                 significantly more accurate at detecting known events
                 than a more traditional threshold-based technique. We
                 also describe how the model can be used to investigate
                 different degrees of periodicity in the data, including
                 systematic day-of-week and time-of-day effects, and to
                 make inferences about different aspects of events such
                 as number of vehicles or people involved. The results
                 indicate that the Markov-modulated Poisson framework
                 provides a robust and accurate framework for adaptively
                 and autonomously learning how to separate unusual
                 bursty events from traces of normal human activity.",
  acknowledgement = ack-nhfb,
  articleno =    "13",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1054",
  keywords =     "Event detection; Markov modulated; Poisson",
}

@Article{Gionis:2007:ADM,
  author =       "Aristides Gionis and Heikki Mannila and Taneli
                 Mielik{\"a}inen and Panayiotis Tsaparas",
  title =        "Assessing data mining results via swap randomization",
  journal =      j-TKDD,
  volume =       "1",
  number =       "3",
  pages =        "14:1--14:??",
  month =        dec,
  year =         "2007",
  CODEN =        "????",
  DOI =          "http://doi.acm.org/10.1145/1297332.1297338",
  ISSN =         "1556-4681 (print), 1556-472X (electronic)",
  bibdate =      "Fri Apr 24 17:58:56 MDT 2009",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tkdd/;
                 http://www.math.utah.edu/pub/tex/bib/tkdd.bib",
  abstract =     "The problem of assessing the significance of data
                 mining results on high-dimensional 0--1 datasets has
                 been studied extensively in the literature. For
                 problems such as mining frequent sets and finding
                 correlations, significance testing can be done by
                 standard statistical tests such as chi-square, or other
                 methods. However, the results of such tests depend only
                 on the specific attributes and not on the dataset as a
                 whole. Moreover, the tests are difficult to apply to
                 sets of patterns or other complex results of data
                 mining algorithms. In this article, we consider a
                 simple randomization technique that deals with this
                 shortcoming. The approach consists of producing random
                 datasets that have the same row and column margins as
                 the given dataset, computing the results of interest on
                 the randomized instances and comparing them to the
                 results on the actual data. This randomization
                 technique can be used to assess the results of many
                 different types of data mining algorithms, such as
                 frequent sets, clustering, and spectral analysis. To
                 generate random datasets with given margins, we use
                 variations of a Markov chain approach which is based on
                 a simple swap operation. We give theoretical results on
                 the efficiency of different randomization methods, and
                 apply the swap randomization method to several
                 well-known datasets. Our results indicate that for some
                 datasets the structure discovered by the data mining
                 algorithms is expected, given the row and column
                 margins of the datasets, while for other datasets the
                 discovered structure conveys information that is not
                 captured by the margin counts.",
  acknowledgement = ack-nhfb,
  articleno =    "14",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1054",
  keywords =     "0--1 data; randomization tests; Significance testing;
                 swaps",
}

@Article{Tang:2008:TTA,
  author =       "Lei Tang and Huan Liu and Jianping Zhang and Nitin
                 Agarwal and John J. Salerno",
  title =        "Topic taxonomy adaptation for group profiling",
  journal =      j-TKDD,
  volume =       "1",
  number =       "4",
  pages =        "1:1--1:??",
  month =        jan,
  year =         "2008",
  CODEN =        "????",
  DOI =          "http://doi.acm.org/10.1145/1324172.1324173",
  ISSN =         "1556-4681 (print), 1556-472X (electronic)",
  bibdate =      "Fri Apr 24 17:59:07 MDT 2009",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tkdd/;
                 http://www.math.utah.edu/pub/tex/bib/tkdd.bib",
  abstract =     "A topic taxonomy is an effective representation that
                 describes salient features of virtual groups or online
                 communities. A topic taxonomy consists of topic nodes.
                 Each internal node is defined by its vertical path
                 (i.e., ancestor and child nodes) and its horizontal
                 list of attributes (or terms). In a text-dominant
                 environment, a topic taxonomy can be used to flexibly
                 describe a group's interests with varying granularity.
                 However, the stagnant nature of a taxonomy may fail to
                 timely capture the dynamic change of a group's
                 interest. This article addresses the problem of how to
                 adapt a topic taxonomy to the accumulated data that
                 reflects the change of a group's interest to achieve
                 dynamic group profiling. We first discuss the issues
                 related to topic taxonomy. We next formulate taxonomy
                 adaptation as an optimization problem to find the
                 taxonomy that best fits the data. We then present a
                 viable algorithm that can efficiently accomplish
                 taxonomy adaptation. We conduct extensive experiments
                 to evaluate our approach's efficacy for group
                 profiling, compare the approach with some alternatives,
                 and study its performance for dynamic group profiling.
                 While pointing out various applications of taxonomy
                  adaptation, we suggest some future work that can take
                 advantage of burgeoning Web 2.0 services for online
                 targeted marketing, counterterrorism in connecting
                 dots, and community tracking.",
  acknowledgement = ack-nhfb,
  articleno =    "1",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1054",
  keywords =     "dynamic profiling; group interest; taxonomy
                 adjustment; text hierarchical classification; Topic
                 taxonomy",
}

@Article{Cormode:2008:FHH,
  author =       "Graham Cormode and Flip Korn and S. Muthukrishnan and
                 Divesh Srivastava",
  title =        "Finding hierarchical heavy hitters in streaming data",
  journal =      j-TKDD,
  volume =       "1",
  number =       "4",
  pages =        "2:1--2:??",
  month =        jan,
  year =         "2008",
  CODEN =        "????",
  DOI =          "http://doi.acm.org/10.1145/1324172.1324174",
  ISSN =         "1556-4681 (print), 1556-472X (electronic)",
  bibdate =      "Fri Apr 24 17:59:07 MDT 2009",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tkdd/;
                 http://www.math.utah.edu/pub/tex/bib/tkdd.bib",
  abstract =     "Data items that arrive online as streams typically
                 have attributes which take values from one or more
                 hierarchies (time and geographic location, source and
                 destination IP addresses, etc.). Providing an aggregate
                 view of such data is important for summarization,
                 visualization, and analysis. We develop an aggregate
                 view based on certain organized sets of large-valued
                 regions (``heavy hitters'') corresponding to
                 hierarchically discounted frequency counts. We formally
                 define the notion of {\em hierarchical heavy hitters\/}
                 (HHHs). We first consider computing (approximate) HHHs
                 over a data stream drawn from a single hierarchical
                 attribute. We formalize the problem and give
                 deterministic algorithms to find them in a single pass
                 over the input.\par

                 In order to analyze a wider range of realistic data
                 streams (e.g., from IP traffic-monitoring
                 applications), we generalize this problem to multiple
                 dimensions. Here, the semantics of HHHs are more
                 complex, since a ``child'' node can have multiple
                 ``parent'' nodes. We present online algorithms that
                 find approximate HHHs in one pass, with provable
                 accuracy guarantees. The product of hierarchical
                 dimensions forms a mathematical lattice structure. Our
                 algorithms exploit this structure, and so are able to
                 track approximate HHHs using only a small, fixed number
                 of statistics per stored item, regardless of the number
                 of dimensions.\par

                 We show experimentally, using real data, that our
                  proposed algorithms yield outputs which are very
                 similar (virtually identical, in many cases) to offline
                 computations of the exact solutions, whereas
                 straightforward heavy-hitters-based approaches give
                 significantly inferior answer quality. Furthermore, the
                 proposed algorithms result in an order of magnitude
                 savings in data structure size while performing
                 competitively.",
  acknowledgement = ack-nhfb,
  articleno =    "2",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1054",
  keywords =     "approximation algorithms; Data mining; network data
                 analysis",
}

@Article{Somaiya:2008:LCU,
  author =       "Manas Somaiya and Christopher Jermaine and Sanjay
                 Ranka",
  title =        "Learning correlations using the mixture-of-subsets
                 model",
  journal =      j-TKDD,
  volume =       "1",
  number =       "4",
  pages =        "3:1--3:??",
  month =        jan,
  year =         "2008",
  CODEN =        "????",
  DOI =          "http://doi.acm.org/10.1145/1324172.1324175",
  ISSN =         "1556-4681 (print), 1556-472X (electronic)",
  bibdate =      "Fri Apr 24 17:59:07 MDT 2009",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tkdd/;
                 http://www.math.utah.edu/pub/tex/bib/tkdd.bib",
  abstract =     "Using a mixture of random variables to model data is a
                 tried-and-tested method common in data mining, machine
                 learning, and statistics. By using mixture modeling it
                 is often possible to accurately model even complex,
                 multimodal data via very simple components. However,
                 the classical mixture model assumes that a data point
                 is generated by a single component in the model. A lot
                 of datasets can be modeled closer to the underlying
                 reality if we drop this restriction. We propose a
                 probabilistic framework, the {\em mixture-of-subsets
                 (MOS) model}, by making two fundamental changes to the
                 classical mixture model. First, we allow a data point
                 to be generated by a set of components, rather than
                 just a single component. Next, we limit the number of
                 data attributes that each component can influence. We
                 also propose an EM framework to learn the MOS model
                 from a dataset, and experimentally evaluate it on real,
                 high-dimensional datasets. Our results show that the
                 MOS model learned from the data represents the
                 underlying nature of the data accurately.",
  acknowledgement = ack-nhfb,
  articleno =    "3",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1054",
  keywords =     "EM algorithm; high-dimensional data; Mixture
                 modeling",
}

@Article{Halkidi:2008:CFB,
  author =       "M. Halkidi and D. Gunopulos and M. Vazirgiannis and N.
                 Kumar and C. Domeniconi",
  title =        "A clustering framework based on subjective and
                 objective validity criteria",
  journal =      j-TKDD,
  volume =       "1",
  number =       "4",
  pages =        "4:1--4:??",
  month =        jan,
  year =         "2008",
  CODEN =        "????",
  DOI =          "http://doi.acm.org/10.1145/1324172.1324176",
  ISSN =         "1556-4681 (print), 1556-472X (electronic)",
  bibdate =      "Fri Apr 24 17:59:07 MDT 2009",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tkdd/;
                 http://www.math.utah.edu/pub/tex/bib/tkdd.bib",
  abstract =     "Clustering, as an unsupervised learning process is a
                 challenging problem, especially in cases of
                 high-dimensional datasets. Clustering result quality
                 can benefit from user constraints and objective
                 validity assessment. In this article, we propose a
                 semisupervised framework for learning the weighted
                 Euclidean subspace, where the best clustering can be
                 achieved. Our approach capitalizes on: (i) user
                 constraints; and (ii) the quality of intermediate
                 clustering results in terms of their structural
                 properties. The proposed framework uses the clustering
                 algorithm and the validity measure as its parameters.
                 We develop and discuss algorithms for learning and
                 tuning the weights of contributing dimensions and
                 defining the ``best'' clustering obtained by satisfying
                 user constraints. Experimental results on benchmark
                 datasets demonstrate the superiority of the proposed
                 approach in terms of improved clustering accuracy.",
  acknowledgement = ack-nhfb,
  articleno =    "4",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1054",
  keywords =     "cluster validity; data mining; Semisupervised
                 learning; similarity measure learning; space learning",
}

@Article{Zaki:2008:ISI,
  author =       "Mohammed J. Zaki and George Karypis and Jiong Yang and
                 Wei Wang",
  title =        "Introduction to special issue on bioinformatics",
  journal =      j-TKDD,
  volume =       "2",
  number =       "1",
  pages =        "1:1--1:??",
  month =        mar,
  year =         "2008",
  CODEN =        "????",
  DOI =          "http://doi.acm.org/10.1145/1342320.1342321",
  ISSN =         "1556-4681 (print), 1556-472X (electronic)",
  bibdate =      "Fri Apr 24 17:59:18 MDT 2009",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tkdd/;
                 http://www.math.utah.edu/pub/tex/bib/tkdd.bib",
  acknowledgement = ack-nhfb,
  articleno =    "1",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1054",
}

@Article{Jin:2008:CMM,
  author =       "Ying Jin and T. M. Murali and Naren Ramakrishnan",
  title =        "Compositional mining of multirelational biological
                 datasets",
  journal =      j-TKDD,
  volume =       "2",
  number =       "1",
  pages =        "2:1--2:??",
  month =        mar,
  year =         "2008",
  CODEN =        "????",
  DOI =          "http://doi.acm.org/10.1145/1342320.1342322",
  ISSN =         "1556-4681 (print), 1556-472X (electronic)",
  bibdate =      "Fri Apr 24 17:59:18 MDT 2009",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tkdd/;
                 http://www.math.utah.edu/pub/tex/bib/tkdd.bib",
  abstract =     "High-throughput biological screens are yielding
                 ever-growing streams of information about multiple
                 aspects of cellular activity. As more and more
                 categories of datasets come online, there is a
                 corresponding multitude of ways in which inferences can
                 be chained across them, motivating the need for
                 compositional data mining algorithms. In this article,
                 we argue that such compositional data mining can be
                 effectively realized by functionally cascading
                 redescription mining and biclustering algorithms as
                 primitives. Both these primitives mirror shifts of
                 vocabulary that can be composed in arbitrary ways to
                 create rich chains of inferences. Given a relational
                 database and its schema, we show how the schema can be
                 automatically compiled into a compositional data mining
                 program, and how different domains in the schema can be
                 related through logical sequences of biclustering and
                 redescription invocations. This feature allows us to
                 rapidly prototype new data mining applications,
                 yielding greater understanding of scientific datasets.
                 We describe two applications of compositional data
                 mining: (i) matching terms across categories of the
                 Gene Ontology and (ii) understanding the molecular
                 mechanisms underlying stress response in human cells.",
  acknowledgement = ack-nhfb,
  articleno =    "2",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1054",
  keywords =     "Biclustering; bioinformatics; compositional data
                 mining; inductive logic programming; redescription
                 mining",
}

@Article{Sahay:2008:DSB,
  author =       "Saurav Sahay and Sougata Mukherjea and Eugene
                 Agichtein and Ernest V. Garcia and Shamkant B. Navathe
                 and Ashwin Ram",
  title =        "Discovering semantic biomedical relations utilizing
                 the {Web}",
  journal =      j-TKDD,
  volume =       "2",
  number =       "1",
  pages =        "3:1--3:??",
  month =        mar,
  year =         "2008",
  CODEN =        "????",
  DOI =          "http://doi.acm.org/10.1145/1342320.1342323",
  ISSN =         "1556-4681 (print), 1556-472X (electronic)",
  bibdate =      "Fri Apr 24 17:59:18 MDT 2009",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tkdd/;
                 http://www.math.utah.edu/pub/tex/bib/tkdd.bib",
  abstract =     "To realize the vision of a Semantic Web for Life
                 Sciences, discovering relations between resources is
                 essential. It is very difficult to automatically
                 extract relations from Web pages expressed in natural
                 language formats. On the other hand, because of the
                 explosive growth of information, it is difficult to
                 manually extract the relations. In this paper we
                 present techniques to automatically discover relations
                 between biomedical resources from the Web. For this
                  purpose we retrieve relevant information from Web
                  search engines and the PubMed database using various
                 lexico-syntactic patterns as queries over SOAP web
                 services. The patterns are initially handcrafted but
                 can be progressively learnt. The extracted relations
                 can be used to construct and augment ontologies and
                 knowledge bases. Experiments are presented for general
                 biomedical relation discovery and domain specific
                 search to show the usefulness of our technique.",
  acknowledgement = ack-nhfb,
  articleno =    "3",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1054",
  keywords =     "Ontology construction; relation identification",
}

@Article{Ye:2008:DSA,
  author =       "Jieping Ye and Jianhui Chen and Ravi Janardan and
                 Sudhir Kumar",
  title =        "Developmental stage annotation of {Drosophila} gene
                 expression pattern images via an entire solution path
                 for {LDA}",
  journal =      j-TKDD,
  volume =       "2",
  number =       "1",
  pages =        "4:1--4:??",
  month =        mar,
  year =         "2008",
  CODEN =        "????",
  DOI =          "http://doi.acm.org/10.1145/1342320.1342324",
  ISSN =         "1556-4681 (print), 1556-472X (electronic)",
  bibdate =      "Fri Apr 24 17:59:18 MDT 2009",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tkdd/;
                 http://www.math.utah.edu/pub/tex/bib/string-matching.bib;
                 http://www.math.utah.edu/pub/tex/bib/tkdd.bib",
  abstract =     "Gene expression in a developing embryo occurs in
                 particular cells (spatial patterns) in a time-specific
                 manner (temporal patterns), which leads to the
                 differentiation of cell fates. Images of a {\em
                 Drosophila melanogaster\/} embryo at a given
                 developmental stage, showing a particular gene
                 expression pattern revealed by a gene-specific probe,
                 can be compared for spatial overlaps. The comparison is
                 fundamentally important to formulating and testing gene
                 interaction hypotheses. Expression pattern comparison
                 is most biologically meaningful when images from a
                 similar time point (developmental stage) are compared.
                 In this paper, we present LdaPath, a novel formulation
                 of Linear Discriminant Analysis (LDA) for automatic
                 developmental stage range classification. It employs
                 multivariate linear regression with the {$ L_1 $}-norm
                 penalty controlled by a regularization parameter for
                 feature extraction and visualization. LdaPath computes
                  an entire solution path for all values of the
                  regularization parameter at essentially the same
                  computational cost as fitting one LDA model. Thus, it
                  facilitates efficient model selection. It is based on
                  the equivalence relationship between LDA and the least
                  squares method for multiclass classification. This
                 equivalence relationship is established under a mild
                 condition, which we show empirically to hold for many
                 high-dimensional datasets, such as expression pattern
                 images. Our experiments on a collection of 2705
                 expression pattern images show the effectiveness of the
                 proposed algorithm. Results also show that the LDA
                 model resulting from LdaPath is sparse, and irrelevant
                 features may be removed. Thus, LdaPath provides a
                 general framework for simultaneous feature selection
                 and feature extraction.",
  acknowledgement = ack-nhfb,
  articleno =    "4",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1054",
  keywords =     "dimensionality reduction; Gene expression pattern
                 image; linear discriminant analysis; linear
                 regression",
}
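
%%% The whole-path idea in the abstract above can be imitated with
%%% off-the-shelf tools: L1-penalized multivariate regression on
%%% class-indicator targets yields coefficients for a grid of penalty
%%% values, with sparsity growing as the penalty does.  This Python
%%% sketch (random toy data; a conceptual stand-in, not the LdaPath
%%% algorithm, whose path costs essentially no more than one LDA fit)
%%% uses scikit-learn's lasso_path:
%%%
%%%     import numpy as np
%%%     from sklearn.linear_model import lasso_path
%%%
%%%     rng = np.random.default_rng(0)
%%%     X = rng.normal(size=(120, 50))    # 120 samples, 50 features
%%%     y = rng.integers(0, 3, size=120)  # 3 stage-range labels
%%%     Y = np.eye(3)[y]                  # class-indicator targets
%%%
%%%     # Coefficients for 20 penalty values in one call.
%%%     alphas, coefs, _ = lasso_path(X, Y, n_alphas=20)
%%%     print(coefs.shape)                # (3, 50, 20)
%%%     nonzero = np.count_nonzero(coefs, axis=(0, 1))
%%%     for a, c in zip(alphas, nonzero):
%%%         print(f"alpha={a:.4f}  nonzero coefficients={c}")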

@Article{Lu:2008:ADA,
  author =       "Yijuan Lu and Qi Tian and Jennifer Neary and Feng Liu
                 and Yufeng Wang",
  title =        "Adaptive discriminant analysis for microarray-based
                 classification",
  journal =      j-TKDD,
  volume =       "2",
  number =       "1",
  pages =        "5:1--5:??",
  month =        mar,
  year =         "2008",
  CODEN =        "????",
  DOI =          "http://doi.acm.org/10.1145/1342320.1342325",
  ISSN =         "1556-4681 (print), 1556-472X (electronic)",
  bibdate =      "Fri Apr 24 17:59:18 MDT 2009",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tkdd/;
                 http://www.math.utah.edu/pub/tex/bib/tkdd.bib",
  abstract =     "Microarray technology has generated enormous amounts
                 of high-dimensional gene expression data, providing a
                 unique platform for exploring gene regulatory networks.
                  However, the curse of dimensionality plagues efforts to
                  analyze these high-throughput data. Linear Discriminant
                 Analysis (LDA) and Biased Discriminant Analysis (BDA)
                 are two popular techniques for dimension reduction,
                 which pay attention to different roles of the positive
                  and negative samples in finding a discriminating
                  subspace. However, the drawbacks of these two methods
                 are obvious: LDA has limited efficiency in classifying
                 sample data from subclasses with different
                 distributions, and BDA does not account for the
                 underlying distribution of negative samples.\par

                 In this paper, we propose a novel dimension reduction
                 technique for microarray analysis: Adaptive
                 Discriminant Analysis (ADA), which effectively exploits
                 favorable attributes of both BDA and LDA and avoids
                 their unfavorable ones. ADA can find a good
                 discriminative subspace with adaptation to different
                 sample distributions. It not only alleviates the
                 problem of high dimensionality, but also enhances the
                  classification performance in the subspace with a
                  na{\"\i}ve Bayes classifier. To learn the best model
                 fitting the real scenario, boosted Adaptive
                 Discriminant Analysis is further proposed. Extensive
                 experiments on the yeast cell cycle regulation data
                 set, and the expression data of the red blood cell
                 cycle in malaria parasite {\em Plasmodium falciparum\/}
                 demonstrate the superior performance of ADA and boosted
                 ADA. We also present some putative genes of specific
                 functional classes predicted by boosted ADA. Their
                 potential functionality is confirmed by independent
                 predictions based on Gene Ontology, demonstrating that
                 ADA and boosted ADA are effective dimension reduction
                 methods for microarray-based classification.",
  acknowledgement = ack-nhfb,
  articleno =    "5",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1054",
  keywords =     "ADA; BDA; boosted ADA; dimension reduction; LDA;
                 microarray",
}

@Article{Hashimoto:2008:NEP,
  author =       "Kosuke Hashimoto and Kiyoko Flora Aoki-Kinoshita and
                 Nobuhisa Ueda and Minoru Kanehisa and Hiroshi
                 Mamitsuka",
  title =        "A new efficient probabilistic model for mining labeled
                 ordered trees applied to glycobiology",
  journal =      j-TKDD,
  volume =       "2",
  number =       "1",
  pages =        "6:1--6:??",
  month =        mar,
  year =         "2008",
  CODEN =        "????",
  DOI =          "http://doi.acm.org/10.1145/1342320.1342326",
  ISSN =         "1556-4681 (print), 1556-472X (electronic)",
  bibdate =      "Fri Apr 24 17:59:18 MDT 2009",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tkdd/;
                 http://www.math.utah.edu/pub/tex/bib/tkdd.bib",
  abstract =     "Mining frequent patterns from large datasets is an
                 important issue in data mining. Recently, complex and
                 unstructured (or semi-structured) datasets have
                 appeared as targets for major data mining applications,
                 including text mining, web mining and bioinformatics.
                 Our work focuses on labeled ordered trees, which are
                 typically semi-structured datasets. In bioinformatics,
                 carbohydrate sugar chains, or glycans, can be modeled
                 as labeled ordered trees. Glycans are the third major
                 class of biomolecules, having important roles in
                 signaling and recognition. For mining labeled ordered
                 trees, we propose a new probabilistic model and its
                 efficient learning scheme which significantly improves
                 the time and space complexity of an existing
                 probabilistic model for labeled ordered trees. We
                 evaluated the performance of the proposed model,
                 comparing it with those of other probabilistic models,
                 using synthetic as well as real datasets from
                 glycobiology. Experimental results showed that the
                 proposed model drastically reduced the computation time
                 of the competing model, keeping the predictive power
                 and avoiding overfitting to the training data. Finally,
                 we assessed our results on real data from a variety of
                 biological viewpoints, verifying known facts in
                 glycobiology.",
  acknowledgement = ack-nhfb,
  articleno =    "6",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1054",
  keywords =     "Expectation-maximization; labeled ordered trees;
                 maximum likelihood; probabilistic models",
}

@Article{Ge:2008:JCA,
  author =       "Rong Ge and Martin Ester and Byron J. Gao and Zengjian
                 Hu and Binay Bhattacharya and Boaz Ben-Moshe",
  title =        "Joint cluster analysis of attribute data and
                 relationship data: {The} connected $k$-center problem,
                 algorithms and applications",
  journal =      j-TKDD,
  volume =       "2",
  number =       "2",
  pages =        "7:1--7:??",
  month =        jul,
  year =         "2008",
  CODEN =        "????",
  DOI =          "http://doi.acm.org/10.1145/1376815.1376816",
  ISSN =         "1556-4681 (print), 1556-472X (electronic)",
  bibdate =      "Fri Apr 24 17:59:30 MDT 2009",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tkdd/;
                 http://www.math.utah.edu/pub/tex/bib/tkdd.bib",
  abstract =     "Attribute data and relationship data are two principal
                 types of data, representing the intrinsic and extrinsic
                 properties of entities. While attribute data have been
                 the main source of data for cluster analysis,
                 relationship data such as social networks or metabolic
                 networks are becoming increasingly available. It is
                  also common to observe that both data types carry
                 complementary information such as in market
                 segmentation and community identification, which calls
                 for a joint cluster analysis of both data types so as
                 to achieve better results. In this article, we
                 introduce the novel Connected $k$-Center ({\em CkC\/})
                 problem, a clustering model taking into account
                 attribute data as well as relationship data. We analyze
                 the complexity of the problem and prove its
                  NP-hardness. We therefore analyze the approximability
                  of the problem and present a constant-factor
                  approximation algorithm. For the special case of the
                 {\em CkC\/} problem where the relationship data form a
                 tree structure, we propose a dynamic programming method
                 giving an optimal solution in polynomial time. We
                 further present NetScan, a heuristic algorithm that is
                 efficient and effective for large real databases. Our
                 extensive experimental evaluation on real datasets
                 demonstrates the meaningfulness and accuracy of the
                 NetScan results.",
  acknowledgement = ack-nhfb,
  articleno =    "7",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1054",
  keywords =     "approximation algorithms; Attribute data; community
                 identification; document clustering; joint cluster
                 analysis; market segmentation; NP-hardness;
                 relationship data",
}

@Article{Gupta:2008:BBC,
  author =       "Gunjan Gupta and Joydeep Ghosh",
  title =        "{Bregman} bubble clustering: a robust framework for
                 mining dense clusters",
  journal =      j-TKDD,
  volume =       "2",
  number =       "2",
  pages =        "8:1--8:??",
  month =        jul,
  year =         "2008",
  CODEN =        "????",
  DOI =          "http://doi.acm.org/10.1145/1376815.1376817",
  ISSN =         "1556-4681 (print), 1556-472X (electronic)",
  bibdate =      "Fri Apr 24 17:59:30 MDT 2009",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tkdd/;
                 http://www.math.utah.edu/pub/tex/bib/tkdd.bib",
  abstract =     "In classical clustering, each data point is assigned
                 to at least one cluster. However, in many applications
                 only a small subset of the available data is relevant
                 for the problem and the rest needs to be ignored in
                 order to obtain good clusters. Certain nonparametric
                 density-based clustering methods find the most relevant
                 data as multiple dense regions, but such methods are
                 generally limited to low-dimensional data and do not
                 scale well to large, high-dimensional datasets. Also,
                 they use a specific notion of ``distance'', typically
                 Euclidean or Mahalanobis distance, which further limits
                 their applicability. On the other hand, the recent One
                 Class Information Bottleneck (OC-IB) method is fast and
                 works on a large class of distortion measures known as
                 Bregman Divergences, but can only find a {\em single\/}
                 dense region. This article presents a broad framework
                 for finding $k$ dense clusters while ignoring the rest
                 of the data. It includes a seeding algorithm that can
                 automatically determine a suitable value for {\em k}.
                 When $k$ is forced to 1, our method gives rise to an
                 improved version of OC-IB with optimality guarantees.
                 We provide a generative model that yields the proposed
                 iterative algorithm for finding $k$ dense regions as a
                 special case. Our analysis reveals an interesting and
                 novel connection between the problem of finding dense
                 regions and exponential mixture models; a hard model
                 corresponding to $k$ exponential mixtures with a
                 uniform background results in a set of $k$ dense
                 clusters. The proposed method describes a highly
                 scalable algorithm for finding multiple dense regions
                 that works with any Bregman Divergence, thus extending
                 density based clustering to a variety of non-Euclidean
                 problems not addressable by earlier methods. We present
                 empirical results on three artificial, two microarray
                 and one text dataset to show the relevance and
                 effectiveness of our methods.",
  acknowledgement = ack-nhfb,
  articleno =    "8",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1054",
  keywords =     "Bregman divergences; Density-based clustering;
                 expectation maximization; exponential family; One Class
                 classification",
}
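
%%% The single-bubble ($k = 1$) case described in the abstract above
%%% can be sketched with squared Euclidean distance, one member of the
%%% Bregman family.  The following Python sketch (an illustration of
%%% the "one dense region, ignore the rest" idea under an assumed
%%% fixed fraction of retained points, not the authors' algorithm)
%%% alternates between membership selection and re-centering:
%%%
%%%     import numpy as np
%%%
%%%     def one_bubble(X, keep=0.3, iters=20, seed=0):
%%%         """Keep the fraction `keep` of points nearest the center,
%%%         then re-center on them; repeat."""
%%%         rng = np.random.default_rng(seed)
%%%         center = X[rng.integers(len(X))]
%%%         m = max(1, int(keep * len(X)))
%%%         for _ in range(iters):
%%%             d = ((X - center) ** 2).sum(axis=1)
%%%             members = np.argsort(d)[:m]
%%%             center = X[members].mean(axis=0)
%%%         return center, members
%%%
%%%     rng = np.random.default_rng(1)
%%%     dense = rng.normal(0, 0.2, (30, 2)) + [2, 2]
%%%     noise = rng.uniform(-4, 4, (70, 2))
%%%     center, members = one_bubble(np.vstack([dense, noise]))
%%%     print(center)   # lands near the dense blob at (2, 2)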

@Article{Tan:2008:TMG,
  author =       "Henry Tan and Fedja Hadzic and Tharam S. Dillon and
                 Elizabeth Chang and Ling Feng",
  title =        "Tree model guided candidate generation for mining
                 frequent subtrees from {XML} documents",
  journal =      j-TKDD,
  volume =       "2",
  number =       "2",
  pages =        "9:1--9:??",
  month =        jul,
  year =         "2008",
  CODEN =        "????",
  DOI =          "http://doi.acm.org/10.1145/1376815.1376818",
  ISSN =         "1556-4681 (print), 1556-472X (electronic)",
  bibdate =      "Fri Apr 24 17:59:30 MDT 2009",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tkdd/;
                 http://www.math.utah.edu/pub/tex/bib/tkdd.bib",
  abstract =     "Due to the inherent flexibilities in both structure
                 and semantics, XML association rules mining faces few
                 challenges, such as: a more complicated hierarchical
                 data structure and ordered data context. Mining
                 frequent patterns from XML documents can be recast as
                 mining frequent tree structures from a database of XML
                 documents. In this study, we model a database of XML
                 documents as a database of rooted labeled ordered
                 subtrees. In particular, we are mainly concerned with
                 mining frequent induced and embedded ordered subtrees.
                 Our main contributions are as follows. We describe our
                 unique {\em embedding list\/} representation of the
                 tree structure, which enables efficient implementation
                 of our {\em Tree Model Guided\/} ({\em TMG\/})
                 candidate generation. {\em TMG\/} is an optimal,
                 nonredundant enumeration strategy that enumerates all
                 the valid candidates that conform to the structural
                 aspects of the data. We show through a mathematical
                 model and experiments that {\em TMG\/} has better
                 complexity compared to the commonly used join approach.
                 In this article, we propose two algorithms, MB3-Miner
                 and iMB3-Miner. MB3-Miner mines embedded subtrees.
                 iMB3-Miner mines induced and/or embedded subtrees by
                 using the {\em maximum level of embedding constraint}.
                 Our experiments with both synthetic and real datasets
                 against two well-known algorithms for mining induced
                 and embedded subtrees, demonstrate the effectiveness
                 and the efficiency of the proposed techniques.",
  acknowledgement = ack-nhfb,
  articleno =    "9",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1054",
  keywords =     "FREQT; TMG; Tree mining; tree model guided;
                 TreeMiner",
}

@Article{Islam:2008:STS,
  author =       "Aminul Islam and Diana Inkpen",
  title =        "Semantic text similarity using corpus-based word
                 similarity and string similarity",
  journal =      j-TKDD,
  volume =       "2",
  number =       "2",
  pages =        "10:1--10:??",
  month =        jul,
  year =         "2008",
  CODEN =        "????",
  DOI =          "http://doi.acm.org/10.1145/1376815.1376819",
  ISSN =         "1556-4681 (print), 1556-472X (electronic)",
  bibdate =      "Fri Apr 24 17:59:30 MDT 2009",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tkdd/;
                 http://www.math.utah.edu/pub/tex/bib/tkdd.bib",
  abstract =     "We present a method for measuring the semantic
                 similarity of texts using a corpus-based measure of
                 semantic word similarity and a normalized and modified
                 version of the Longest Common Subsequence (LCS) string
                 matching algorithm. Existing methods for computing text
                 similarity have focused mainly on either large
                 documents or individual words. We focus on computing
                 the similarity between two sentences or two short
                 paragraphs. The proposed method can be exploited in a
                 variety of applications involving textual knowledge
                 representation and knowledge discovery. Evaluation
                 results on two different data sets show that our method
                 outperforms several competing methods.",
  acknowledgement = ack-nhfb,
  articleno =    "10",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1054",
  keywords =     "corpus-based measures; Semantic similarity of words;
                 similarity of short texts",
}
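
%%% The string-similarity component in the abstract above builds on
%%% the classic LCS dynamic program.  A Python sketch follows (one
%%% common normalization shown for illustration; the paper's modified
%%% variants are not reproduced here):
%%%
%%%     def lcs_length(a, b):
%%%         """Longest common subsequence length, by dynamic programming."""
%%%         dp = [[0] * (len(b) + 1) for _ in range(len(a) + 1)]
%%%         for i in range(len(a)):
%%%             for j in range(len(b)):
%%%                 dp[i + 1][j + 1] = (dp[i][j] + 1 if a[i] == b[j]
%%%                                     else max(dp[i][j + 1], dp[i + 1][j]))
%%%         return dp[len(a)][len(b)]
%%%
%%%     def nlcs(a, b):
%%%         """LCS length normalized by both string lengths."""
%%%         if not a or not b:
%%%             return 0.0
%%%         return lcs_length(a, b) ** 2 / (len(a) * len(b))
%%%
%%%     print(nlcs("albastru", "alabaster"))   # approx. 0.68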

@Article{Sun:2008:ITA,
  author =       "Jimeng Sun and Dacheng Tao and Spiros Papadimitriou
                 and Philip S. Yu and Christos Faloutsos",
  title =        "Incremental tensor analysis: {Theory} and
                 applications",
  journal =      j-TKDD,
  volume =       "2",
  number =       "3",
  pages =        "11:1--11:??",
  month =        oct,
  year =         "2008",
  CODEN =        "????",
  DOI =          "http://doi.acm.org/10.1145/1409620.1409621",
  ISSN =         "1556-4681 (print), 1556-472X (electronic)",
  bibdate =      "Fri Apr 24 17:59:41 MDT 2009",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tkdd/;
                 http://www.math.utah.edu/pub/tex/bib/tkdd.bib",
  abstract =     "How do we find patterns in author-keyword
                 associations, evolving over time? Or in data cubes
                  (tensors), with product-branch-customer sales
                  information? And more generally, how do we summarize
                  high-order data cubes (tensors)? How do we incrementally
                  update these patterns over time? Matrix decompositions,
                 like principal component analysis (PCA) and variants,
                  are invaluable tools for mining, dimensionality
                  reduction, feature selection, and rule identification
                  in numerous settings such as streaming data, text,
                  graphs, and social networks. However, they handle
                  only two orders (i.e., matrices, like the author and
                  keyword of the previous example).\par

                 We propose to envision such higher-order data as
                 tensors, and tap the vast literature on the topic.
                 However, these methods do not necessarily scale up, let
                 alone operate on semi-infinite streams. Thus, we
                 introduce a general framework, incremental tensor
                 analysis (ITA), which efficiently computes a compact
                 summary for high-order and high-dimensional data, and
                 also reveals the hidden correlations. Three variants of
                 ITA are presented: (1) dynamic tensor analysis (DTA);
                 (2) streaming tensor analysis (STA); and (3)
                 window-based tensor analysis (WTA). In particular, we
                 explore several fundamental design trade-offs such as
                 space efficiency, computational cost, approximation
                 accuracy, time dependency, and model complexity.\par

                 We implement all our methods and apply them in several
                 real settings, such as network anomaly detection,
                 multiway latent semantic indexing on citation networks,
                 and correlation study on sensor measurements. Our
                 empirical studies show that the proposed methods are
                 fast and accurate and that they find interesting
                 patterns and outliers on the real datasets.",
  acknowledgement = ack-nhfb,
  articleno =    "11",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1054",
  keywords =     "multilinear algebra; stream mining; Tensor",
}
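
%%% The incremental flavor of the framework in the abstract above can
%%% be conveyed by a single-mode sketch: maintain a forgetting-factor
%%% covariance of the incoming slices unfolded along one mode and keep
%%% its top eigenvectors as the summary basis.  This Python sketch is
%%% an assumption-laden illustration of that style of update, not the
%%% published DTA/STA/WTA algorithms:
%%%
%%%     import numpy as np
%%%
%%%     def mode_update(C, slice2d, forget=0.98, rank=2):
%%%         """Update one mode's covariance with a new unfolded slice
%%%         and return the top-`rank` projection basis."""
%%%         C = forget * C + slice2d @ slice2d.T
%%%         eigvals, eigvecs = np.linalg.eigh(C)  # ascending order
%%%         return C, eigvecs[:, -rank:]
%%%
%%%     rng = np.random.default_rng(0)
%%%     C = np.zeros((10, 10))
%%%     for t in range(100):              # stream of 10 x 8 slices
%%%         C, U = mode_update(C, rng.normal(size=(10, 8)))
%%%     print(U.shape)                    # (10, 2) summary basis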

@Article{Mangasarian:2008:PPC,
  author =       "Olvi L. Mangasarian and Edward W. Wild and Glenn M.
                 Fung",
  title =        "Privacy-preserving classification of vertically
                 partitioned data via random kernels",
  journal =      j-TKDD,
  volume =       "2",
  number =       "3",
  pages =        "12:1--12:??",
  month =        oct,
  year =         "2008",
  CODEN =        "????",
  DOI =          "http://doi.acm.org/10.1145/1409620.1409622",
  ISSN =         "1556-4681 (print), 1556-472X (electronic)",
  bibdate =      "Fri Apr 24 17:59:41 MDT 2009",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tkdd/;
                 http://www.math.utah.edu/pub/tex/bib/tkdd.bib",
  abstract =     "We propose a novel privacy-preserving support vector
                 machine (SVM) classifier for a data matrix $A$ whose
                 input feature columns are divided into groups belonging
                 to different entities. Each entity is unwilling to
                 share its group of columns or make it public. Our
                 classifier is based on the concept of a reduced kernel
                  $K(A, B')$, where $B'$ is the transpose of a
                 random matrix $B$. The column blocks of $B$
                 corresponding to the different entities are privately
                 generated by each entity and never made public. The
                 proposed linear or nonlinear SVM classifier, which is
                 public but does not reveal any of the privately held
                 data, has accuracy comparable to that of an ordinary
                 SVM classifier that uses the entire set of input
                 features directly.",
  acknowledgement = ack-nhfb,
  articleno =    "12",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1054",
  keywords =     "Privacy preserving classification; support vector
                 machines; vertically partitioned data",
}
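
%%% For a linear kernel, the privacy argument in the abstract above is
%%% easy to see: with $A = [A_1 \; A_2]$ and $B = [B_1 \; B_2]$ split
%%% by entity, $A B' = A_1 B_1' + A_2 B_2'$, so each entity can publish
%%% its local product without revealing its columns.  A Python sketch
%%% (random toy data; the nonlinear case in the article is not shown):
%%%
%%%     import numpy as np
%%%
%%%     rng = np.random.default_rng(0)
%%%     A1, A2 = rng.normal(size=(8, 5)), rng.normal(size=(8, 3))
%%%     B1, B2 = rng.normal(size=(4, 5)), rng.normal(size=(4, 3))
%%%
%%%     # Each entity computes A_i B_i' locally; only these products
%%%     # are shared, never the private column blocks themselves.
%%%     K = A1 @ B1.T + A2 @ B2.T
%%%     A, B = np.hstack([A1, A2]), np.hstack([B1, B2])
%%%     assert np.allclose(K, A @ B.T)    # equals the reduced kernel
%%%     print(K.shape)                    # (8, 4)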

@Article{Lakshmanan:2008:DRA,
  author =       "Laks V. S. Lakshmanan and Raymond T. Ng and Ganesh
                 Ramesh",
  title =        "On disclosure risk analysis of anonymized itemsets in
                 the presence of prior knowledge",
  journal =      j-TKDD,
  volume =       "2",
  number =       "3",
  pages =        "13:1--13:??",
  month =        oct,
  year =         "2008",
  CODEN =        "????",
  DOI =          "http://doi.acm.org/10.1145/1409620.1409623",
  ISSN =         "1556-4681 (print), 1556-472X (electronic)",
  bibdate =      "Fri Apr 24 17:59:41 MDT 2009",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tkdd/;
                 http://www.math.utah.edu/pub/tex/bib/tkdd.bib",
  abstract =     "Decision makers of companies often face the dilemma of
                 whether to release data for knowledge discovery,
                 vis-a-vis the risk of disclosing proprietary or
                 sensitive information. Among the various methods
                 employed for ``sanitizing'' the data prior to
                 disclosure, we focus in this article on anonymization,
                 given its widespread use in practice. We do due
                 diligence to the question ``just how safe is the
                  anonymized data?'' We consider both the scenario in
                  which the hacker has no information and, more
                  realistically, that in which the hacker has partial
                  information about items in the domain. We conduct our
                  analyses in the context of frequent set mining and
                  address the safety question at two different levels:
                  (i) how likely are the identities of individual items
                  to be cracked (i.e., re-identified by a hacker), and
                  (ii) how likely are sets of items to be cracked? For
                 capturing the prior knowledge of the hacker, we propose
                 a {\em belief function}, which amounts to an educated
                 guess of the frequency of each item. For various
                 classes of belief functions which correspond to
                 different degrees of prior knowledge, we derive
                  formulas for computing the expected number of cracks
                  of single items and, for itemsets, the probability of
                  cracking them. While obtaining exact values
                  for more general situations is computationally hard, we
                  propose a series of heuristics called the {\em
                  O-estimates}. They are easy to compute and are shown
                  to be fairly accurate by empirical results on real
                  benchmark datasets. Based on the O-estimates, we
                 propose a recipe for the decision makers to resolve
                 their dilemma. Our recipe operates at two different
                 levels, depending on whether the data owner wants to
                 reason in terms of single items or sets of items (or
                  (both). Finally, we present techniques for ascertaining
                  a hacker's likely knowledge of correlations, in terms
                  of co-occurrences of items. This information
                 regarding the hacker's knowledge can be incorporated
                 into our framework of disclosure risk analysis and we
                 present experimental results demonstrating how this
                 knowledge affects the heuristic estimates we have
                 developed.",
  acknowledgement = ack-nhfb,
  articleno =    "13",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1054",
  keywords =     "anonymization; belief function; bipartite graphs;
                 correlation; Disclosure risk; frequent itemsets;
                 hacker; matching; prior knowledge; sampling",
}

@Article{Vaidya:2008:PPD,
  author =       "Jaideep Vaidya and Chris Clifton and Murat
                 Kantarcioglu and A. Scott Patterson",
  title =        "Privacy-preserving decision trees over vertically
                 partitioned data",
  journal =      j-TKDD,
  volume =       "2",
  number =       "3",
  pages =        "14:1--14:??",
  month =        oct,
  year =         "2008",
  CODEN =        "????",
  DOI =          "http://doi.acm.org/10.1145/1409620.1409624",
  ISSN =         "1556-4681 (print), 1556-472X (electronic)",
  bibdate =      "Fri Apr 24 17:59:41 MDT 2009",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tkdd/;
                 http://www.math.utah.edu/pub/tex/bib/tkdd.bib",
  abstract =     "Privacy and security concerns can prevent sharing of
                 data, derailing data-mining projects. Distributed
                 knowledge discovery, if done correctly, can alleviate
                 this problem. We introduce a generalized
                 privacy-preserving variant of the ID3 algorithm for
                 vertically partitioned data distributed over two or
                 more parties. Along with a proof of security, we
                 discuss what would be necessary to make the protocols
                 completely secure. We also provide experimental
                 results, giving a first demonstration of the practical
                 complexity of secure multiparty computation-based data
                 mining.",
  acknowledgement = ack-nhfb,
  articleno =    "14",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1054",
  keywords =     "Decision tree classification; privacy",
}

@Article{Chuang:2009:FPS,
  author =       "Kun-Ta Chuang and Hung-Leng Chen and Ming-Syan Chen",
  title =        "Feature-preserved sampling over streaming data",
  journal =      j-TKDD,
  volume =       "2",
  number =       "4",
  pages =        "15:1--15:??",
  month =        jan,
  year =         "2009",
  CODEN =        "????",
  DOI =          "http://doi.acm.org/10.1145/1460797.1460798",
  ISSN =         "1556-4681 (print), 1556-472X (electronic)",
  bibdate =      "Fri Apr 24 17:59:51 MDT 2009",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tkdd/;
                 http://www.math.utah.edu/pub/tex/bib/tkdd.bib",
  abstract =     "In this article, we explore a novel sampling model,
                  called {\em feature-preserved sampling\/} ({\em FPS\/})
                 that sequentially generates a high-quality sample over
                 sliding windows. The sampling quality we consider
                 refers to the degree of consistency between the sample
                 proportion and the population proportion of each
                 attribute value in a window. Due to the time-variant
                 nature of real-world datasets, users are more likely to
                 be interested in the most recent data. However,
                 previous works have not been able to generate a
                 high-quality sample over sliding windows that precisely
                 preserves up-to-date population characteristics.
                 Motivated by this shortcoming, we have developed the
                 {\em FPS\/} algorithm, which has several advantages:
                 (1) it sequentially generates a sample from a
                 time-variant data source over sliding windows; (2) the
                 execution time of {\em FPS\/} is linear with respect to
                 the database size; (3) the {\em relative\/}
                 proportional differences between the sample proportions
                 and population proportions of most distinct attribute
                 values are guaranteed to be below a specified error
                  threshold $\epsilon$, while the {\em relative\/}
                 proportion differences of the remaining attribute
                 values are as close to $\epsilon$ as possible, which
                 ensures that the generated sample is of high quality;
                 (4) the sample rate is close to the user specified rate
                 so that a high quality sampling result can be obtained
                 without increasing the sample size; (5) by a thorough
                 analytical and empirical study, we prove that {\em
                 FPS\/} has acceptable space overheads, especially when
                  the attribute values have Zipfian distributions, and
                  that {\em FPS\/} also preserves the population
                  proportion of multivariate features well in the
                 sample; and (6) {\em FPS\/} can be applied to infinite
                 streams and finite datasets equally, and the generated
                 samples can be used for various applications. Our
                 experiments on both real and synthetic data validate
                 that {\em FPS\/} can effectively obtain a high quality
                 sample of the desired size. In addition, while using
                 the sample generated by {\em FPS\/} in various mining
                 applications, a significant improvement in efficiency
                 can be achieved without compromising the model's
                 precision.",
  acknowledgement = ack-nhfb,
  articleno =    "15",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1054",
  keywords =     "sampling; Streaming mining",
}
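
%%% The sampling-quality notion in the abstract above (relative
%%% difference between sample and population proportions of each
%%% attribute value) is simple to evaluate.  A Python sketch of the
%%% quality check only, not of the FPS algorithm itself:
%%%
%%%     from collections import Counter
%%%
%%%     def relative_diffs(window, sample):
%%%         """Per-value relative difference between the sample
%%%         proportion and the population proportion in the window."""
%%%         pop, samp = Counter(window), Counter(sample)
%%%         return {v: abs(samp[v] / len(sample) - n / len(window))
%%%                    / (n / len(window))
%%%                 for v, n in pop.items()}
%%%
%%%     window = ["a"] * 60 + ["b"] * 30 + ["c"] * 10
%%%     sample = ["a"] * 6 + ["b"] * 3 + ["c"] * 1
%%%     print(relative_diffs(window, sample))  # all 0.0 here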

@Article{Jiang:2009:MFC,
  author =       "Daxin Jiang and Jian Pei",
  title =        "Mining frequent cross-graph quasi-cliques",
  journal =      j-TKDD,
  volume =       "2",
  number =       "4",
  pages =        "16:1--16:??",
  month =        jan,
  year =         "2009",
  CODEN =        "????",
  DOI =          "http://doi.acm.org/10.1145/1460797.1460799",
  ISSN =         "1556-4681 (print), 1556-472X (electronic)",
  bibdate =      "Fri Apr 24 17:59:51 MDT 2009",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tkdd/;
                 http://www.math.utah.edu/pub/tex/bib/tkdd.bib",
  abstract =     "Joint mining of multiple datasets can often discover
                 interesting, novel, and reliable patterns which cannot
                 be obtained solely from any single source. For example,
                 in bioinformatics, jointly mining multiple gene
                 expression datasets obtained by different labs or
                 during various biological processes may overcome the
                 heavy noise in the data. Moreover, by joint mining of
                 gene expression data and protein-protein interaction
                 data, we may discover clusters of genes which show
                 coherent expression patterns and also produce
                 interacting proteins. Such clusters may be potential
                 pathways.\par

                 In this article, we investigate a novel data mining
                 problem, {\em mining frequent cross-graph
                 quasi-cliques}, which is generalized from several
                 interesting applications in bioinformatics,
                 cross-market customer segmentation, social network
                 analysis, and Web mining. In a graph, a set of vertices
                 $S$ is a $\gamma$-quasi-clique $(0 < \gamma \leq 1)$ if
                 each vertex $v$ in $S$ directly connects to at least
                 $\gamma \cdot (|S| - 1)$ other vertices in $S$. Given a
                  set of graphs $G_1, \ldots{}, G_n$ and parameter
                  ${\rm min\_sup}$ $(0 < {\rm min\_sup} \leq 1)$, a set of vertices
                 $S$ is a frequent cross-graph quasi-clique if $S$ is a
                 $\gamma$-quasi-clique in at least ${\rm min\_sup} \cdot
                 n$ graphs, and there does not exist a proper superset
                 of $S$ having the property.\par

                 We build a general model, show why the complete set of
                 frequent cross-graph quasi-cliques cannot be found by
                 previous data mining methods, and study the complexity
                 of the problem. While the problem is difficult, we
                 develop practical algorithms which exploit several
                 interesting and effective techniques and heuristics to
                 efficaciously mine frequent cross-graph quasi-cliques.
                 A systematic performance study is reported on both
                 synthetic and real data sets. We demonstrate some
                 interesting and meaningful frequent cross-graph
                 quasi-cliques in bioinformatics. The experimental
                 results also show that our algorithms are efficient and
                 scalable.",
  acknowledgement = ack-nhfb,
  articleno =    "16",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1054",
  keywords =     "bioinformatics; clique; Graph mining; joint mining",
}
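
%%% The degree condition defining a $\gamma$-quasi-clique in the
%%% abstract above translates directly into code.  A Python sketch
%%% (checks the degree and frequency conditions only; the maximality
%%% requirement and the mining algorithms are omitted):
%%%
%%%     def is_quasi_clique(adj, S, gamma):
%%%         """Every vertex of S must have at least
%%%         gamma * (|S| - 1) neighbors inside S."""
%%%         S = set(S)
%%%         return all(len(adj[v] & S - {v}) >= gamma * (len(S) - 1)
%%%                    for v in S)
%%%
%%%     def is_frequent(graphs, S, gamma, min_sup):
%%%         """Quasi-clique in at least min_sup * n of the n graphs."""
%%%         hits = sum(is_quasi_clique(g, S, gamma) for g in graphs)
%%%         return hits >= min_sup * len(graphs)
%%%
%%%     g1 = {1: {2, 3}, 2: {1, 3}, 3: {1, 2}}   # triangle
%%%     g2 = {1: {2}, 2: {1, 3}, 3: {2}}         # path
%%%     print(is_frequent([g1, g2], {1, 2, 3}, gamma=0.5, min_sup=1.0))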

@Article{Domeniconi:2009:WCE,
  author =       "Carlotta Domeniconi and Muna Al-Razgan",
  title =        "Weighted cluster ensembles: {Methods} and analysis",
  journal =      j-TKDD,
  volume =       "2",
  number =       "4",
  pages =        "17:1--17:??",
  month =        jan,
  year =         "2009",
  CODEN =        "????",
  DOI =          "http://doi.acm.org/10.1145/1460797.1460800",
  ISSN =         "1556-4681 (print), 1556-472X (electronic)",
  bibdate =      "Fri Apr 24 17:59:51 MDT 2009",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tkdd/;
                 http://www.math.utah.edu/pub/tex/bib/tkdd.bib",
  abstract =     "Cluster ensembles offer a solution to challenges
                 inherent to clustering arising from its ill-posed
                 nature. Cluster ensembles can provide robust and stable
                 solutions by leveraging the consensus across multiple
                 clustering results, while averaging out emergent
                 spurious structures that arise due to the various
                 biases to which each participating algorithm is tuned.
                 In this article, we address the problem of combining
                 multiple {\em weighted clusters\/} that belong to
                 different subspaces of the input space. We leverage the
                 diversity of the input clusterings in order to generate
                 a consensus partition that is superior to the
                 participating ones. Since we are dealing with weighted
                 clusters, our consensus functions make use of the
                 weight vectors associated with the clusters. We
                 demonstrate the effectiveness of our techniques by
                 running experiments with several real datasets,
                 including high-dimensional text data. Furthermore, we
                 investigate in depth the issue of diversity and
                 accuracy for our ensemble methods. Our analysis and
                 experimental results show that the proposed techniques
                 are capable of producing a partition that is as good as
                 or better than the best individual clustering.",
  acknowledgement = ack-nhfb,
  articleno =    "17",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1054",
  keywords =     "accuracy and diversity measures; Cluster ensembles;
                 consensus functions; data mining; subspace clustering;
                 text data",
}

@Article{Zhang:2009:DGA,
  author =       "Zhenjie Zhang and Laks V. S. Lakshmanan and Anthony K.
                 H. Tung",
  title =        "On domination game analysis for microeconomic data
                 mining",
  journal =      j-TKDD,
  volume =       "2",
  number =       "4",
  pages =        "18:1--18:??",
  month =        jan,
  year =         "2009",
  CODEN =        "????",
  DOI =          "http://doi.acm.org/10.1145/1460797.1460801",
  ISSN =         "1556-4681 (print), 1556-472X (electronic)",
  bibdate =      "Fri Apr 24 17:59:51 MDT 2009",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tkdd/;
                 http://www.math.utah.edu/pub/tex/bib/tkdd.bib",
  abstract =     "Game theory is a powerful tool for analyzing the
                 competitions among manufacturers in a market. In this
                 article, we present a study on combining game theory
                 and data mining by introducing the concept of
                 domination game analysis. We present a multidimensional
                 market model, where every dimension represents one
                 attribute of a commodity. Every product or customer is
                 represented by a point in the multidimensional space,
                 and a product is said to ``dominate'' a customer if all
                 of its attributes can satisfy the requirements of the
                 customer. The expected market share of a product is
                  measured by the expected number of buyers among the
                  customers, each of whom is equally likely to buy any
                  product dominating him. A Nash equilibrium is a
                 configuration of the products achieving stable expected
                  market shares for all products. We prove that a Nash
                  equilibrium in such a model can be computed in
                  polynomial time if every manufacturer tries to modify
                  its product in a round-robin manner. To further improve
                 the efficiency of the computation, we also design two
                 algorithms for the manufacturers to efficiently find
                 their best response to other products in the market.",
  acknowledgement = ack-nhfb,
  articleno =    "18",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1054",
  keywords =     "data mining; Domination game; game theory",
}
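
%%% The market model in the abstract above admits a compact
%%% illustration: a product dominates a customer when every attribute
%%% meets the requirement, and each customer splits one unit of
%%% expected demand equally among the products dominating him.  A
%%% Python sketch (toy data; the equilibrium computation is omitted):
%%%
%%%     def dominates(product, customer):
%%%         """Every attribute meets the customer's requirement."""
%%%         return all(p >= c for p, c in zip(product, customer))
%%%
%%%     def expected_shares(products, customers):
%%%         shares = [0.0] * len(products)
%%%         for c in customers:
%%%             doms = [i for i, p in enumerate(products)
%%%                     if dominates(p, c)]
%%%             for i in doms:           # equal purchase likelihood
%%%                 shares[i] += 1 / len(doms)
%%%         return shares
%%%
%%%     products = [(3, 2), (2, 3)]
%%%     customers = [(1, 1), (3, 1), (1, 3)]
%%%     print(expected_shares(products, customers))   # [1.5, 1.5]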

@Article{Kriegel:2009:CHD,
  author =       "Hans-Peter Kriegel and Peer Kr{\"o}ger and Arthur
                 Zimek",
  title =        "Clustering high-dimensional data: {A} survey on
                 subspace clustering, pattern-based clustering, and
                 correlation clustering",
  journal =      j-TKDD,
  volume =       "3",
  number =       "1",
  pages =        "1:1--1:??",
  month =        mar,
  year =         "2009",
  CODEN =        "????",
  DOI =          "http://doi.acm.org/10.1145/1497577.1497578",
  ISSN =         "1556-4681 (print), 1556-472X (electronic)",
  bibdate =      "Fri Apr 24 18:00:01 MDT 2009",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tkdd/;
                 http://www.math.utah.edu/pub/tex/bib/tkdd.bib",
  abstract =     "As a prolific research area in data mining, subspace
                  clustering and related problems have induced a vast
                  number of proposed solutions. However, many publications
                 compare a new proposition --- if at all --- with one or
                 two competitors, or even with a so-called
                 ``na{\"\i}ve'' ad hoc solution, but fail to clarify the
                 exact problem definition. As a consequence, even if two
                 solutions are thoroughly compared experimentally, it
                 will often remain unclear whether both solutions tackle
                 the same problem or, if they do, whether they agree in
                 certain tacit assumptions and how such assumptions may
                 influence the outcome of an algorithm. In this survey,
                 we try to clarify: (i) the different problem
                 definitions related to subspace clustering in general;
                 (ii) the specific difficulties encountered in this
                 field of research; (iii) the varying assumptions,
                 heuristics, and intuitions forming the basis of
                 different approaches; and (iv) how several prominent
                 solutions tackle different problems.",
  acknowledgement = ack-nhfb,
  articleno =    "1",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1054",
  keywords =     "clustering; high-dimensional data; Survey",
}

@Article{Dhurandhar:2009:SAM,
  author =       "Amit Dhurandhar and Alin Dobra",
  title =        "Semi-analytical method for analyzing models and model
                 selection measures based on moment analysis",
  journal =      j-TKDD,
  volume =       "3",
  number =       "1",
  pages =        "2:1--2:??",
  month =        mar,
  year =         "2009",
  CODEN =        "????",
  DOI =          "http://doi.acm.org/10.1145/1497577.1497579",
  ISSN =         "1556-4681 (print), 1556-472X (electronic)",
  bibdate =      "Fri Apr 24 18:00:01 MDT 2009",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tkdd/;
                 http://www.math.utah.edu/pub/tex/bib/tkdd.bib",
  abstract =     "In this article we propose a moment-based method for
                 studying models and model selection measures. By
                 focusing on the probabilistic space of classifiers
                 induced by the classification algorithm rather than on
                 that of datasets, we obtain efficient characterizations
                  for computing the moments, followed by visualization
                  of the resulting formulae, which are otherwise too
                  complicated for direct interpretation. By assuming the
                 data to be drawn independently and identically
                 distributed from the underlying probability
                 distribution, and by going over the space of all
                 possible datasets, we establish general relationships
                 between the generalization error, hold-out-set error,
                 cross-validation error, and leave-one-out error. We
                 later exemplify the method and the results by studying
                 the behavior of the errors for the naive Bayes
                 classifier.",
  acknowledgement = ack-nhfb,
  articleno =    "2",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1054",
  keywords =     "classification; generalization error; Model
                 selection",
}

@Article{Cerf:2009:CPM,
  author =       "Lo{\"\i}c Cerf and J{\'e}r{\'e}my Besson and
                 C{\'e}line Robardet and Jean-Fran{\c{c}}ois Boulicaut",
  title =        "Closed patterns meet $n$-ary relations",
  journal =      j-TKDD,
  volume =       "3",
  number =       "1",
  pages =        "3:1--3:??",
  month =        mar,
  year =         "2009",
  CODEN =        "????",
  DOI =          "http://doi.acm.org/10.1145/1497577.1497580",
  ISSN =         "1556-4681 (print), 1556-472X (electronic)",
  bibdate =      "Fri Apr 24 18:00:01 MDT 2009",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tkdd/;
                 http://www.math.utah.edu/pub/tex/bib/tkdd.bib",
  abstract =     "Set pattern discovery from binary relations has been
                 extensively studied during the last decade. In
                 particular, many complete and efficient algorithms for
                 frequent closed set mining are now available.
                 Generalizing such a task to $n$-ary relations ($n \geq
                  2$) appears to be a timely challenge. It may be important
                 for many applications, for example, when adding the
                 time dimension to the popular {\em objects\/} $\times$
                 {\em features\/} binary case. The generality of the
                 task (no assumption being made on the relation arity or
                 on the size of its attribute domains) makes it
                 computationally challenging. We introduce an algorithm
                 called Data-Peeler. From an $n$-ary relation, it
                 extracts all closed $n$-sets satisfying given piecewise
                 (anti) monotonic constraints. This new class of
                 constraints generalizes both monotonic and
                  antimonotonic constraints. In the special case of
                  ternary relations, Data-Peeler outperforms the
                  state-of-the-art algorithms CubeMiner and Trias by
                  orders of magnitude. This performance is due to a
                  new enumeration strategy that efficiently enforces
                  the closedness property. The
                  relevance of the extracted closed $n$-sets is assessed
                  on real-life 3- and 4-ary relations. Beyond natural
                  3- or 4-ary relations, expanding a relation with an
                 additional attribute can help in enforcing rather
                 abstract constraints such as the robustness with
                 respect to binarization. Furthermore, a collection of
                 closed $n$-sets is shown to be an excellent starting
                 point to compute a tiling of the dataset.",
  acknowledgement = ack-nhfb,
  articleno =    "3",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1054",
  keywords =     "$n$-ary relations; Closed patterns; constraint
                 properties; constraint-based mining; tiling",
}
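
%%% The closedness notion for $n$-sets in the abstract above can be
%%% verified tuple-by-tuple.  A brute-force Python sketch (checks a
%%% candidate only; Data-Peeler's enumeration and constraint handling
%%% are not reproduced):
%%%
%%%     from itertools import product
%%%
%%%     def is_closed_nset(R, S, domains):
%%%         """S = (S_1, ..., S_n) must cover only tuples of R, and no
%%%         dimension may absorb a new element while preserving that."""
%%%         if not all(t in R for t in product(*S)):
%%%             return False
%%%         for d, dom in enumerate(domains):
%%%             for e in dom - S[d]:
%%%                 grown = list(S)
%%%                 grown[d] = S[d] | {e}
%%%                 if all(t in R for t in product(*grown)):
%%%                     return False      # could still grow: not closed
%%%         return True
%%%
%%%     R = {(0, 0, 0), (0, 1, 0), (1, 0, 0), (1, 1, 0), (1, 1, 1)}
%%%     S = ({0, 1}, {0, 1}, {0})
%%%     print(is_closed_nset(R, S, ({0, 1}, {0, 1}, {0, 1})))  # True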

@Article{Angiulli:2009:DEA,
  author =       "Fabrizio Angiulli and Fabio Fassetti",
  title =        "{DOLPHIN}: {An} efficient algorithm for mining
                 distance-based outliers in very large datasets",
  journal =      j-TKDD,
  volume =       "3",
  number =       "1",
  pages =        "4:1--4:??",
  month =        mar,
  year =         "2009",
  CODEN =        "????",
  DOI =          "http://doi.acm.org/10.1145/1497577.1497581",
  ISSN =         "1556-4681 (print), 1556-472X (electronic)",
  bibdate =      "Fri Apr 24 18:00:01 MDT 2009",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tkdd/;
                 http://www.math.utah.edu/pub/tex/bib/tkdd.bib",
  abstract =     "In this work a novel distance-based outlier detection
                 algorithm, named DOLPHIN, working on disk-resident
                 datasets and whose I/O cost corresponds to the cost of
                 sequentially reading the input dataset file twice, is
                 presented.\par

                 It is both theoretically and empirically shown that the
                 main memory usage of DOLPHIN amounts to a small
                 fraction of the dataset and that DOLPHIN has linear
                 time performance with respect to the dataset size.
                  DOLPHIN gains efficiency by naturally merging three
                  strategies into a unified schema, namely the
                 selection policy of objects to be maintained in main
                 memory, usage of pruning rules, and similarity search
                 techniques. Importantly, similarity search is
                 accomplished by the algorithm without the need of
                 preliminarily indexing the whole dataset, as other
                 methods do.\par

                 The algorithm is simple to implement and it can be used
                 with any type of data, belonging to either metric or
                 nonmetric spaces. Moreover, a modification to the basic
                 method allows DOLPHIN to deal with the scenario in
                 which the available buffer of main memory is smaller
                 than its standard requirements. DOLPHIN has been
                 compared with state-of-the-art distance-based outlier
                 detection algorithms, showing that it is much more
                 efficient.",
  acknowledgement = ack-nhfb,
  articleno =    "4",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1054",
  keywords =     "Data mining; distance-based outliers; outlier
                 detection",
}
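
%%% A common definition behind distance-based outlier mining, as in
%%% the abstract above: a point is an outlier if fewer than $k$ other
%%% points lie within distance $R$ of it.  The naive quadratic Python
%%% check below only illustrates that definition; DOLPHIN computes its
%%% answers with two sequential scans of a disk-resident file:
%%%
%%%     def distance_outliers(points, R, k, dist):
%%%         """Points with fewer than k neighbors within radius R."""
%%%         return [p for p in points
%%%                 if sum(dist(p, q) <= R
%%%                        for q in points if q is not p) < k]
%%%
%%%     def euclid(a, b):
%%%         return sum((x - y) ** 2 for x, y in zip(a, b)) ** 0.5
%%%
%%%     pts = [(0, 0), (0.1, 0), (0, 0.1), (5, 5)]
%%%     print(distance_outliers(pts, R=1.0, k=2, dist=euclid))
%%%     # [(5, 5)]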

@Article{Chen:2009:BAS,
  author =       "Bee-Chung Chen and Raghu Ramakrishnan and Jude W.
                 Shavlik and Pradeep Tamma",
  title =        "Bellwether analysis: {Searching} for cost-effective
                 query-defined predictors in large databases",
  journal =      j-TKDD,
  volume =       "3",
  number =       "1",
  pages =        "5:1--5:??",
  month =        mar,
  year =         "2009",
  CODEN =        "????",
  DOI =          "http://doi.acm.org/10.1145/1497577.1497582",
  ISSN =         "1556-4681 (print), 1556-472X (electronic)",
  bibdate =      "Fri Apr 24 18:00:01 MDT 2009",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tkdd/;
                 http://www.math.utah.edu/pub/tex/bib/tkdd.bib",
  abstract =     "How to mine massive datasets is a challenging problem
                 with great potential value. Motivated by this
                 challenge, much effort has concentrated on developing
                 scalable versions of machine learning algorithms.
                 However, the cost of mining large datasets is not just
                 computational; preparing the datasets into the ``right
                 form'' so that learning algorithms can be applied is
                 usually costly, due to the human labor that is
                 typically required and a large number of choices in
                 data preparation, which include selecting different
                 subsets of data and aggregating data at different
                 granularities. We make the key observation that, for a
                 number of practically motivated problems, these choices
                 can be defined using database queries and analyzed in
                 an automatic and systematic manner. Specifically, we
                 propose a new class of data-mining problem, called {\em
                 bellwether analysis}, in which the goal is to find a
                 few query-defined predictors (e.g., first week sales of
                 Peoria, IL of an item) that can be used to accurately
                 predict the result of a target query (e.g., first year
                 worldwide sales of the item) from a large number of
                 queries that define candidate predictors. To make a
                 prediction for a new item, the data needed to generate
                 such predictors has to be collected (e.g., selling the
                 new item in Peoria, IL for a week and collecting the
                 sales data). A useful predictor is one that has high
                 prediction accuracy and a low data-collection cost. We
                 call such a cost-effective predictor a {\em
                 bellwether}.\par

                 This article introduces bellwether analysis, which
                 integrates database query processing and predictive
                 modeling into a single framework, and provides scalable
                 algorithms for large datasets that cannot fit in main
                 memory. Through a series of extensive experiments, we
                 show that bellwethers do exist in real-world databases,
                 and that our computation techniques achieve good
                 efficiency on large datasets.",
  acknowledgement = ack-nhfb,
  articleno =    "5",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1054",
  keywords =     "bellwether; Cost-effective prediction; data cube; OLAP
                 queries; predictive models; scalable algorithms",
}

@Article{Liu:2009:ISI,
  author =       "Huan Liu and John Salerno and Michael Young and Rakesh
                 Agrawal and Philip S. Yu",
  title =        "Introduction to special issue on social computing,
                 behavioral modeling, and prediction",
  journal =      j-TKDD,
  volume =       "3",
  number =       "2",
  pages =        "6:1--6:??",
  month =        apr,
  year =         "2009",
  CODEN =        "????",
  DOI =          "http://doi.acm.org/10.1145/1514888.1514889",
  ISSN =         "1556-4681 (print), 1556-472X (electronic)",
  bibdate =      "Fri Apr 24 18:00:12 MDT 2009",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tkdd/;
                 http://www.math.utah.edu/pub/tex/bib/tkdd.bib",
  acknowledgement = ack-nhfb,
  articleno =    "6",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1054",
}

@Article{Mehler:2009:ENC,
  author =       "Andrew Mehler and Steven Skiena",
  title =        "Expanding network communities from representative
                 examples",
  journal =      j-TKDD,
  volume =       "3",
  number =       "2",
  pages =        "7:1--7:??",
  month =        apr,
  year =         "2009",
  CODEN =        "????",
  DOI =          "http://doi.acm.org/10.1145/1514888.1514890",
  ISSN =         "1556-4681 (print), 1556-472X (electronic)",
  bibdate =      "Fri Apr 24 18:00:12 MDT 2009",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tkdd/;
                 http://www.math.utah.edu/pub/tex/bib/tkdd.bib",
  abstract =     "We present an approach to leverage a small subset of a
                 coherent community within a social network into a much
                 larger, more representative sample. Our problem becomes
                 identifying a small conductance subgraph containing
                 many (but not necessarily all) members of the given
                 seed set. Starting with an initial seed set
                 representing a sample of a community, we seek to
                 discover as much of the full community as
                 possible.\par

                 We present a general method for network community
                 expansion, demonstrating that our methods work well in
                  expanding communities in real-world networks starting
                 from small given seed groups (20 to 400 members). Our
                 approach is marked by incremental expansion from the
                 seeds with retrospective analysis to determine the
                 ultimate boundaries of our community. We demonstrate
                 how to increase the robustness of the general approach
                 through bootstrapping multiple random partitions of the
                 input set into seed and evaluation groups.\par

                 We go beyond statistical comparisons against gold
                 standards to careful subjective evaluations of our
                 expanded communities. This process explains the causes
                 of most disagreement between our expanded communities
                  and our gold standards --- arguing that our expansion
                 methods provide more reliable communities than can be
                 extracted from reference sources/gazetteers such as
                 Wikipedia.",
  acknowledgement = ack-nhfb,
  articleno =    "7",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1054",
  keywords =     "artificial intelligence; community discovery; Discrete
                 mathematics; graph theory; news analysis; social
                 networks",
}
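
%%% A generic greedy version of the seed-set expansion described
%%% above: grow the community around the seeds one neighbor at a
%%% time, keeping the cut conductance small.  This is an illustrative
%%% heuristic, not the paper's incremental/retrospective method or
%%% its bootstrapped seed partitions.
%%%
%%%   import networkx as nx
%%%
%%%   def conductance(G, S):
%%%       vol = sum(d for _, d in G.degree(S))
%%%       return nx.cut_size(G, S) / max(vol, 1)
%%%
%%%   def expand_community(G, seeds, max_size=400):
%%%       S = set(seeds)
%%%       while len(S) < max_size:
%%%           frontier = set(nx.node_boundary(G, S))
%%%           if not frontier:
%%%               break
%%%           best = min(frontier, key=lambda v: conductance(G, S | {v}))
%%%           if conductance(G, S | {best}) >= conductance(G, S):
%%%               break               # no neighbor improves the cut
%%%           S.add(best)
%%%       return S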

@Article{Lin:2009:ACT,
  author =       "Yu-Ru Lin and Yun Chi and Shenghuo Zhu and Hari
                 Sundaram and Belle L. Tseng",
  title =        "Analyzing communities and their evolutions in dynamic
                 social networks",
  journal =      j-TKDD,
  volume =       "3",
  number =       "2",
  pages =        "8:1--8:??",
  month =        apr,
  year =         "2009",
  CODEN =        "????",
  DOI =          "http://doi.acm.org/10.1145/1514888.1514891",
  ISSN =         "1556-4681 (print), 1556-472X (electronic)",
  bibdate =      "Fri Apr 24 18:00:12 MDT 2009",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tkdd/;
                 http://www.math.utah.edu/pub/tex/bib/tkdd.bib",
  abstract =     "We discover communities from social network data and
                 analyze the community evolution. These communities are
                 inherent characteristics of human interaction in online
                 social networks, as well as paper citation networks.
                 Also, communities may evolve over time, due to changes
                 to individuals' roles and social status in the network
                 as well as changes to individuals' research interests.
                 We present an innovative algorithm that deviates from
                 the traditional two-step approach to analyze community
                 evolutions. In the traditional approach, communities
                 are first detected for each time slice, and then
                 compared to determine correspondences. We argue that
                 this approach is inappropriate in applications with
                 noisy data. In this paper, we propose {\em FacetNet\/}
                 for analyzing communities and their evolutions through
                 a robust {\em unified\/} process. This novel framework
                 will discover communities and capture their evolution
                 with temporal smoothness given by historic community
                 structures. Our approach relies on formulating the
                 problem in terms of maximum a posteriori (MAP)
                 estimation, where the community structure is estimated
                 both by the observed networked data and by the prior
                 distribution given by historic community structures.
                 Then we develop an iterative algorithm, with proven low
                 time complexity, which is guaranteed to converge to an
                 optimal solution. We perform extensive experimental
                 studies, on both synthetic datasets and real datasets,
                 to demonstrate that our method discovers meaningful
                 communities and provides additional insights not
                 directly obtainable from traditional methods.",
  acknowledgement = ack-nhfb,
  articleno =    "8",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1054",
  keywords =     "Community; community net; evolution; evolution net;
                 nonnegative matrix factorization; soft membership",
}
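
%%% The unified process described above fits the current snapshot
%%% while penalizing deviation from the previous community structure.
%%% A simplified least-squares surrogate with nonnegative soft
%%% memberships (FacetNet's actual divergence-based MAP formulation
%%% and update rules are not reproduced here):
%%%
%%%   import numpy as np
%%%
%%%   def smooth_communities(W, U_prev, lam=0.1, steps=200, lr=1e-3):
%%%       """W: symmetric adjacency; U_prev: previous memberships."""
%%%       U = U_prev.copy()
%%%       for _ in range(steps):
%%%           R = U @ U.T - W                      # snapshot residual
%%%           grad = 4 * R @ U + 2 * lam * (U - U_prev)
%%%           U = np.maximum(U - lr * grad, 0.0)   # memberships >= 0
%%%       return U / (U.sum(axis=1, keepdims=True) + 1e-12)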

@Article{Kimura:2009:BLM,
  author =       "Masahiro Kimura and Kazumi Saito and Hiroshi Motoda",
  title =        "Blocking links to minimize contamination spread in a
                 social network",
  journal =      j-TKDD,
  volume =       "3",
  number =       "2",
  pages =        "9:1--9:??",
  month =        apr,
  year =         "2009",
  CODEN =        "????",
  DOI =          "http://doi.acm.org/10.1145/1514888.1514892",
  ISSN =         "1556-4681 (print), 1556-472X (electronic)",
  bibdate =      "Fri Apr 24 18:00:12 MDT 2009",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tkdd/;
                 http://www.math.utah.edu/pub/tex/bib/tkdd.bib",
  abstract =     "We address the problem of minimizing the propagation
                 of undesirable things, such as computer viruses or
                 malicious rumors, by blocking a limited number of links
                  in a network; this is the converse of the influence
                  maximization problem, in which the most influential
                  nodes for information diffusion are sought in a social
                  network. This minimization problem is more fundamental
                 than the problem of preventing the spread of
                 contamination by removing nodes in a network. We
                 introduce two definitions for the contamination degree
                 of a network, accordingly define two contamination
                 minimization problems, and propose methods for
                 efficiently finding good approximate solutions to these
                 problems on the basis of a naturally greedy strategy.
                 Using large social networks, we experimentally
                 demonstrate that the proposed methods outperform
                 conventional link-removal methods. We also show that
                 unlike the case of blocking a limited number of nodes,
                 the strategy of removing nodes with high out-degrees is
                 not necessarily effective for these problems.",
  acknowledgement = ack-nhfb,
  articleno =    "9",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1054",
  keywords =     "Contamination diffusion; link analysis; social
                 networks",
}
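
%%% The naturally greedy strategy mentioned above, in skeleton form:
%%% repeatedly remove the link whose removal most reduces an
%%% estimated contamination degree.  The Monte Carlo spread model
%%% (undirected graph, fixed propagation probability) is an
%%% assumption for illustration only.
%%%
%%%   import random
%%%   import networkx as nx
%%%
%%%   def contamination(G, p=0.1, trials=100):
%%%       total, nodes = 0, list(G)
%%%       for _ in range(trials):
%%%           active = {random.choice(nodes)}
%%%           frontier = list(active)
%%%           while frontier:
%%%               u = frontier.pop()
%%%               for v in G.neighbors(u):
%%%                   if v not in active and random.random() < p:
%%%                       active.add(v)
%%%                       frontier.append(v)
%%%           total += len(active)
%%%       return total / trials
%%%
%%%   def block_links(G, k):
%%%       blocked = []
%%%       for _ in range(k):    # naive: re-estimates for every edge
%%%           best = min(G.edges(), key=lambda e: contamination(
%%%               nx.restricted_view(G, [], [e])))
%%%           G.remove_edge(*best)
%%%           blocked.append(best)
%%%       return blocked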

@Article{Agichtein:2009:MIS,
  author =       "Eugene Agichtein and Yandong Liu and Jiang Bian",
  title =        "Modeling information-seeker satisfaction in community
                 question answering",
  journal =      j-TKDD,
  volume =       "3",
  number =       "2",
  pages =        "10:1--10:??",
  month =        apr,
  year =         "2009",
  CODEN =        "????",
  DOI =          "http://doi.acm.org/10.1145/1514888.1514893",
  ISSN =         "1556-4681 (print), 1556-472X (electronic)",
  bibdate =      "Fri Apr 24 18:00:12 MDT 2009",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tkdd/;
                 http://www.math.utah.edu/pub/tex/bib/tkdd.bib",
  abstract =     "Question Answering Communities such as Naver, Baidu
                 Knows, and Yahoo! Answers have emerged as popular, and
                 often effective, means of information seeking on the
                 web. By posting questions for other participants to
                 answer, information seekers can obtain specific answers
                 to their questions. Users of CQA portals have already
                 contributed millions of questions, and received
                 hundreds of millions of answers from other
                 participants. However, CQA is not always effective: in
                 some cases, a user may obtain a perfect answer within
                 minutes, and in others it may require hours --- and
                 sometimes days --- until a satisfactory answer is
                 contributed. We investigate the problem of predicting
                 information seeker satisfaction in collaborative
                 question answering communities, where we attempt to
                 predict whether a question author will be satisfied
                 with the answers submitted by the community
                 participants. We present a general prediction model,
                 and develop a variety of content, structure, and
                 community-focused features for this task. Our
                 experimental results, obtained from a large-scale
                 evaluation over thousands of real questions and user
                 ratings, demonstrate the feasibility of modeling and
                 predicting asker satisfaction. We complement our
                 results with a thorough investigation of the
                 interactions and information seeking patterns in
                 question answering communities that correlate with
                 information seeker satisfaction. We also explore {\em
                 personalized\/} models of asker satisfaction, and show
                 that when sufficient interaction history exists,
                 personalization can significantly improve prediction
                 accuracy over a ``one-size-fits-all'' model. Our models
                 and predictions could be useful for a variety of
                 applications, such as user intent inference, answer
                 ranking, interface design, and query suggestion and
                 routing.",
  acknowledgement = ack-nhfb,
  articleno =    "10",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1054",
  keywords =     "Community question answering; information seeker
                 satisfaction",
}
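
%%% The prediction task above is a supervised classification over
%%% content, structure, and community features.  A placeholder
%%% pipeline (the feature set and classifier are assumptions, not
%%% the paper's model):
%%%
%%%   from sklearn.ensemble import RandomForestClassifier
%%%
%%%   def satisfaction_model(X_train, y_train):
%%%       """Rows of X_train might hold question length, answer
%%%       count, answerer reputation, and time to first answer;
%%%       y_train is 1 when the asker was satisfied."""
%%%       clf = RandomForestClassifier(n_estimators=200, random_state=0)
%%%       clf.fit(X_train, y_train)
%%%       return clf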

@Article{Torvik:2009:AND,
  author =       "Vetle I. Torvik and Neil R. Smalheiser",
  title =        "Author name disambiguation in {MEDLINE}",
  journal =      j-TKDD,
  volume =       "3",
  number =       "3",
  pages =        "11:1--11:??",
  month =        jul,
  year =         "2009",
  CODEN =        "????",
  DOI =          "http://doi.acm.org/10.1145/1552303.1552304",
  ISSN =         "1556-4681 (print), 1556-472X (electronic)",
  bibdate =      "Tue Mar 16 18:36:58 MDT 2010",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tkdd/;
                 http://www.math.utah.edu/pub/tex/bib/tkdd.bib",
  abstract =     "{\em Background\/}: We recently described
                 ``Author-ity,'' a model for estimating the probability
                 that two articles in MEDLINE, sharing the same author
                 name, were written by the same individual. Features
                 include shared title words, journal name, coauthors,
                 medical subject headings, language, affiliations, and
                 author name features (middle initial, suffix, and
                 prevalence in MEDLINE). Here we test the hypothesis
                 that the Author-ity model will suffice to disambiguate
                 author names for the vast majority of articles in
                 MEDLINE. {\em Methods\/}: Enhancements include: (a)
                 incorporating first names and their variants, email
                 addresses, and correlations between specific last names
                 and affiliation words; (b) new methods of generating
                 large unbiased training sets; (c) new methods for
                 estimating the prior probability; (d) a weighted least
                 squares algorithm for correcting transitivity
                 violations; and (e) a maximum likelihood based
                 agglomerative algorithm for computing clusters of
                 articles that represent inferred author-individuals.
                 {\em Results\/}: Pairwise comparisons were computed for
                 all author names on all 15.3 million articles in
                 MEDLINE (2006 baseline), that share last name and first
                 initial, to create Author-ity 2006, a database that has
                 each name on each article assigned to one of 6.7
                 million inferred author-individual clusters. Recall is
                 estimated at $\approx 98.8\%$. Lumping (putting two
                 different individuals into the same cluster) affects
                 $\approx 0.5\%$ of clusters, whereas splitting
                 (assigning articles written by the same individual to
                 $> 1$ cluster) affects $\approx 2\%$ of articles. {\em
                 Impact\/}: The Author-ity model can be applied
                 generally to other bibliographic databases. Author name
                 disambiguation allows information retrieval and data
                 integration to become {\em person-centered}, not just
                 {\em document-centered}, setting the stage for new data
                 mining and social network tools that will facilitate
                 the analysis of scholarly publishing and collaboration
                 behavior. {\em Availability\/}: The Author-ity 2006
                 database is available for nonprofit academic research,
                 and can be freely queried via
                 http://arrowsmith.psych.uic.edu.",
  acknowledgement = ack-nhfb,
  articleno =    "11",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1054",
  keywords =     "bibliographic databases; Name disambiguation",
}
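
%%% The clustering stage described above can be skeletonized as
%%% probability-driven agglomeration: keep merging the two clusters
%%% with the highest average pairwise match probability.  pair_prob
%%% stands in for the Author-ity probability model; the
%%% maximum-likelihood details and transitivity correction are not
%%% reproduced.
%%%
%%%   def agglomerate(articles, pair_prob, threshold=0.5):
%%%       clusters = [[a] for a in articles]
%%%       def avg(c1, c2):
%%%           ps = [pair_prob(a, b) for a in c1 for b in c2]
%%%           return sum(ps) / len(ps)
%%%       while True:
%%%           best, pair = threshold, None
%%%           for i in range(len(clusters)):
%%%               for j in range(i + 1, len(clusters)):
%%%                   p = avg(clusters[i], clusters[j])
%%%                   if p > best:
%%%                       best, pair = p, (i, j)
%%%           if pair is None:
%%%               return clusters
%%%           i, j = pair
%%%           clusters[i].extend(clusters.pop(j))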

@Article{Tu:2009:SDC,
  author =       "Li Tu and Yixin Chen",
  title =        "Stream data clustering based on grid density and
                 attraction",
  journal =      j-TKDD,
  volume =       "3",
  number =       "3",
  pages =        "12:1--12:??",
  month =        jul,
  year =         "2009",
  CODEN =        "????",
  DOI =          "http://doi.acm.org/10.1145/1552303.1552305",
  ISSN =         "1556-4681 (print), 1556-472X (electronic)",
  bibdate =      "Tue Mar 16 18:36:58 MDT 2010",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tkdd/;
                 http://www.math.utah.edu/pub/tex/bib/tkdd.bib",
  abstract =     "Clustering real-time stream data is an important and
                 challenging problem. Existing algorithms such as
                 CluStream are based on the {\em k\/} -means algorithm.
                 These clustering algorithms have difficulties finding
                 clusters of arbitrary shapes and handling outliers.
                 Further, they require the knowledge of {\em k\/} and
                 user-specified time window. To address these issues,
                 this article proposes {\em D-Stream}, a framework for
                 clustering stream data using a density-based
                 approach.\par

                 Our algorithm uses an online component that maps each
                 input data record into a grid and an offline component
                 that computes the grid density and clusters the grids
                 based on the density. The algorithm adopts a density
                 decaying technique to capture the dynamic changes of a
                  data stream and an attraction-based mechanism to
                 accurately generate cluster boundaries.\par

                 Exploiting the intricate relationships among the decay
                 factor, attraction, data density, and cluster
                 structure, our algorithm can efficiently and
                 effectively generate and adjust the clusters in real
                 time. Further, a theoretically sound technique is
                 developed to detect and remove sporadic grids mapped by
                 outliers in order to dramatically improve the space and
                 time efficiency of the system. The technique makes
                 high-speed data stream clustering feasible without
                 degrading the clustering quality. The experimental
                 results show that our algorithm has superior quality
                 and efficiency, can find clusters of arbitrary shapes,
                 and can accurately recognize the evolving behaviors of
                 real-time data streams.",
  acknowledgement = ack-nhfb,
  articleno =    "12",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1054",
  keywords =     "clustering; data mining; density-based algorithms;
                 Stream data",
}
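
%%% The density decaying technique above lets the online component
%%% maintain grid densities with one update per record: a grid's
%%% density fades geometrically between hits.  Sketch with an
%%% assumed uniform cell width:
%%%
%%%   import math
%%%
%%%   class GridSynopsis:
%%%       def __init__(self, decay=0.998, cell=1.0):
%%%           self.decay, self.cell = decay, cell
%%%           self.density, self.last = {}, {}
%%%
%%%       def insert(self, point, t):
%%%           g = tuple(math.floor(x / self.cell) for x in point)
%%%           dt = t - self.last.get(g, t)
%%%           # fade the density since the last hit, then add one
%%%           self.density[g] = (self.density.get(g, 0.0)
%%%                              * self.decay ** dt + 1.0)
%%%           self.last[g] = t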

@Article{Zhou:2009:LST,
  author =       "Bin Zhou and Jian Pei",
  title =        "Link spam target detection using page farms",
  journal =      j-TKDD,
  volume =       "3",
  number =       "3",
  pages =        "13:1--13:??",
  month =        jul,
  year =         "2009",
  CODEN =        "????",
  DOI =          "http://doi.acm.org/10.1145/1552303.1552306",
  ISSN =         "1556-4681 (print), 1556-472X (electronic)",
  bibdate =      "Tue Mar 16 18:36:58 MDT 2010",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tkdd/;
                 http://www.math.utah.edu/pub/tex/bib/tkdd.bib",
  abstract =     "Currently, most popular Web search engines adopt some
                 link-based ranking methods such as PageRank. Driven by
                 the huge potential benefit of improving rankings of Web
                 pages, many tricks have been attempted to boost page
                 rankings. The most common way, which is known as link
                 spam, is to make up some artificially designed link
                 structures. Detecting link spam effectively is a big
                 challenge. In this article, we develop novel and
                 effective detection methods for link spam target pages
                 using page farms. The essential idea is intuitive:
                 whether a page is the beneficiary of link spam is
                 reflected by how it collects its PageRank score.
                 Technically, how a target page collects its PageRank
                 score is modeled by a page farm, which consists of
                 pages contributing a major portion of the PageRank
                 score of the target page. We propose two spamicity
                 measures based on page farms. They can be used as an
                 effective measure to check whether the pages are link
                 spam target pages. An empirical study using a newly
                 available real dataset strongly suggests that our
                 method is effective. It outperforms the
                 state-of-the-art methods like SpamRank and SpamMass in
                 both precision and recall.",
  acknowledgement = ack-nhfb,
  articleno =    "13",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1054",
  keywords =     "Link Spam; Page Farm; PageRank",
}
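
%%% The page-farm intuition above, in its most naive form: rank
%%% candidate contributors by how much the target's PageRank drops
%%% when they are removed.  The paper's farm-extraction algorithms
%%% are far cheaper; this brute-force scoring is for illustration.
%%%
%%%   import networkx as nx
%%%
%%%   def top_farm(G, target, k=10):
%%%       base = nx.pagerank(G)[target]
%%%       def drop(page):
%%%           H = G.copy()
%%%           H.remove_node(page)
%%%           return base - nx.pagerank(H).get(target, 0.0)
%%%       others = [v for v in G if v != target]
%%%       return sorted(others, key=drop, reverse=True)[:k]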

@Article{Wan:2009:DBC,
  author =       "Li Wan and Wee Keong Ng and Xuan Hong Dang and Philip
                 S. Yu and Kuan Zhang",
  title =        "Density-based clustering of data streams at multiple
                 resolutions",
  journal =      j-TKDD,
  volume =       "3",
  number =       "3",
  pages =        "14:1--14:??",
  month =        jul,
  year =         "2009",
  CODEN =        "????",
  DOI =          "http://doi.acm.org/10.1145/1552303.1552307",
  ISSN =         "1556-4681 (print), 1556-472X (electronic)",
  bibdate =      "Tue Mar 16 18:36:58 MDT 2010",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tkdd/;
                 http://www.math.utah.edu/pub/tex/bib/tkdd.bib",
  abstract =     "In data stream clustering, it is desirable to have
                 algorithms that are able to detect clusters of
                 arbitrary shape, clusters that evolve over time, and
                 clusters with noise. Existing stream data clustering
                 algorithms are generally based on an online-offline
                 approach: The online component captures synopsis
                 information from the data stream (thus, overcoming
                 real-time and memory constraints) and the offline
                 component generates clusters using the stored synopsis.
                 The online-offline approach affects the overall
                 performance of stream data clustering in various ways:
                 the ease of deriving synopsis from streaming data; the
                 complexity of data structure for storing and managing
                 synopsis; and the frequency at which the offline
                 component is used to generate clusters. In this
                 article, we propose an algorithm that (1) computes and
                 updates synopsis information in constant time; (2)
                 allows users to discover clusters at multiple
                 resolutions; (3) determines the right time for users to
                 generate clusters from the synopsis information; (4)
                 generates clusters of higher purity than existing
                 algorithms; and (5) determines the right threshold
                 function for density-based clustering based on the
                 fading model of stream data. To the best of our
                  knowledge, no existing data stream algorithm has all
                 of these features. Experimental results show that our
                 algorithm is able to detect arbitrarily shaped,
                 evolving clusters with high quality.",
  acknowledgement = ack-nhfb,
  articleno =    "14",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1054",
  keywords =     "Data mining algorithms; density based clustering;
                 evolving data streams",
}

@Article{Mannila:2009:ATS,
  author =       "Heikki Mannila and Dimitrios Gunopulos",
  title =        "{ACM TKDD} special issue {ACM SIGKDD 2007} and {ACM
                 SIGKDD 2008}",
  journal =      j-TKDD,
  volume =       "3",
  number =       "4",
  pages =        "15:1--15:??",
  month =        nov,
  year =         "2009",
  CODEN =        "????",
  DOI =          "http://doi.acm.org/10.1145/1631162.1631163",
  ISSN =         "1556-4681 (print), 1556-472X (electronic)",
  bibdate =      "Tue Mar 16 18:37:13 MDT 2010",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tkdd/;
                 http://www.math.utah.edu/pub/tex/bib/tkdd.bib",
  acknowledgement = ack-nhfb,
  articleno =    "15",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1054",
}

@Article{Asur:2009:EBF,
  author =       "Sitaram Asur and Srinivasan Parthasarathy and Duygu
                 Ucar",
  title =        "An event-based framework for characterizing the
                 evolutionary behavior of interaction graphs",
  journal =      j-TKDD,
  volume =       "3",
  number =       "4",
  pages =        "16:1--16:??",
  month =        nov,
  year =         "2009",
  CODEN =        "????",
  DOI =          "http://doi.acm.org/10.1145/1631162.1631164",
  ISSN =         "1556-4681 (print), 1556-472X (electronic)",
  bibdate =      "Tue Mar 16 18:37:13 MDT 2010",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tkdd/;
                 http://www.math.utah.edu/pub/tex/bib/tkdd.bib",
  abstract =     "Interaction graphs are ubiquitous in many fields such
                 as bioinformatics, sociology and physical sciences.
                  Many studies in the literature have been devoted to
                  analyzing and mining these graphs. However, almost
                 all of them have studied these graphs from a static
                 point of view. The study of the evolution of these
                 graphs over time can provide tremendous insight on the
                 behavior of entities, communities and the flow of
                 information among them. In this work, we present an
                 event-based characterization of critical behavioral
                 patterns for temporally varying interaction graphs. We
                 use nonoverlapping snapshots of interaction graphs and
                 develop a framework for capturing and identifying
                 interesting events from them. We use these events to
                 characterize complex behavioral patterns of individuals
                 and communities over time. We show how semantic
                 information can be incorporated to reason about
                 community-behavior events. We also demonstrate the
                 application of behavioral patterns for the purposes of
                 modeling evolution, link prediction and influence
                 maximization. Finally, we present a diffusion model for
                 evolving networks, based on our framework.",
  acknowledgement = ack-nhfb,
  articleno =    "16",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1054",
  keywords =     "diffusion of innovations; Dynamic interaction
                 networks; evolutionary analysis",
}
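
%%% A minimal version of the event detection described above:
%%% compare community sets in consecutive nonoverlapping snapshots
%%% and label events by overlap.  The paper defines events such as
%%% continue, merge, split, form, and dissolve; the overlap rules
%%% below are simplified stand-ins.
%%%
%%%   def events(prev, curr, jaccard=0.5):
%%%       J = lambda a, b: len(a & b) / len(a | b)
%%%       out = []
%%%       for c in curr:
%%%           parents = [p for p in prev if p & c]
%%%           if not parents:
%%%               out.append(("form", c))
%%%           elif len(parents) > 1:
%%%               out.append(("merge", c))
%%%           elif J(parents[0], c) >= jaccard:
%%%               out.append(("continue", c))
%%%       for p in prev:
%%%           children = [c for c in curr if p & c]
%%%           if not children:
%%%               out.append(("dissolve", p))
%%%           elif len(children) > 1:
%%%               out.append(("split", p))
%%%       return out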

@Article{Chi:2009:ESC,
  author =       "Yun Chi and Xiaodan Song and Dengyong Zhou and Koji
                 Hino and Belle L. Tseng",
  title =        "On evolutionary spectral clustering",
  journal =      j-TKDD,
  volume =       "3",
  number =       "4",
  pages =        "17:1--17:??",
  month =        nov,
  year =         "2009",
  CODEN =        "????",
  DOI =          "http://doi.acm.org/10.1145/1631162.1631165",
  ISSN =         "1556-4681 (print), 1556-472X (electronic)",
  bibdate =      "Tue Mar 16 18:37:13 MDT 2010",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tkdd/;
                 http://www.math.utah.edu/pub/tex/bib/tkdd.bib",
  abstract =     "Evolutionary clustering is an emerging research area
                 essential to important applications such as clustering
                 dynamic Web and blog contents and clustering data
                 streams. In evolutionary clustering, a good clustering
                 result should fit the current data well, while
                 simultaneously not deviate too dramatically from the
                 recent history. To fulfill this dual purpose, a measure
                 of {\em temporal smoothness\/} is integrated in the
                 overall measure of clustering quality. In this article,
                 we propose two frameworks that incorporate temporal
                 smoothness in evolutionary spectral clustering. For
                 both frameworks, we start with intuitions gained from
                 the well-known {\em k\/} -means clustering problem, and
                 then propose and solve corresponding cost functions for
                 the evolutionary spectral clustering problems. Our
                 solutions to the evolutionary spectral clustering
                 problems provide more stable and consistent clustering
                 results that are less sensitive to short-term noises
                 while at the same time are adaptive to long-term
                 cluster drifts. Furthermore, we demonstrate that our
                 methods provide the optimal solutions to the relaxed
                 versions of the corresponding evolutionary {\em k\/}
                 -means clustering problems. Performance experiments
                 over a number of real and synthetic data sets
                  illustrate that our evolutionary spectral clustering methods
                 provide more robust clustering results that are not
                 sensitive to noise and can adapt to data drifts.",
  acknowledgement = ack-nhfb,
  articleno =    "17",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1054",
  keywords =     "Evolutionary spectral clustering; preserving cluster
                 membership; preserving cluster quality; temporal
                 smoothness",
}
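
%%% The dual objective above, fitting the current data while staying
%%% close to recent history, can be seen in the k-means analogue the
%%% authors start from: total cost blends a snapshot term with a
%%% temporal-smoothness term.  Illustrative sketch (the blending and
%%% update rule are simplified assumptions):
%%%
%%%   import numpy as np
%%%
%%%   def evolutionary_kmeans(X, prev_centers, alpha=0.8, iters=20):
%%%       C = prev_centers.copy()
%%%       for _ in range(iters):
%%%           d = ((X[:, None, :] - C[None, :, :]) ** 2).sum(-1)
%%%           labels = np.argmin(d, axis=1)
%%%           for k in range(len(C)):
%%%               pts = X[labels == k]
%%%               if len(pts):
%%%                   # blend the snapshot mean with history
%%%                   C[k] = (alpha * pts.mean(0)
%%%                           + (1 - alpha) * prev_centers[k])
%%%       return C, labels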

@Article{Fujiwara:2009:FLS,
  author =       "Yasuhiro Fujiwara and Yasushi Sakurai and Masaru
                 Kitsuregawa",
  title =        "Fast likelihood search for hidden {Markov} models",
  journal =      j-TKDD,
  volume =       "3",
  number =       "4",
  pages =        "18:1--18:??",
  month =        nov,
  year =         "2009",
  CODEN =        "????",
  DOI =          "http://doi.acm.org/10.1145/1631162.1631166",
  ISSN =         "1556-4681 (print), 1556-472X (electronic)",
  bibdate =      "Tue Mar 16 18:37:13 MDT 2010",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tkdd/;
                 http://www.math.utah.edu/pub/tex/bib/tkdd.bib",
  abstract =     "Hidden Markov models (HMMs) are receiving considerable
                 attention in various communities and many applications
                 that use HMMs have emerged such as mental task
                 classification, biological analysis, traffic
                 monitoring, and anomaly detection. This article has two
                  goals: the first goal is exact and efficient
                 identification of the model whose state sequence has
                 the highest likelihood for the given query sequence
                 (more precisely, no HMM that actually has a
                 high-probability path for the given sequence is missed
                 by the algorithm), and the second goal is exact and
                 efficient monitoring of streaming data sequences to
                 find the best model. We propose SPIRAL, a fast search
                 method for HMM datasets. SPIRAL is based on three
                  ideas: (1) it clusters states of models to compute
                 approximate likelihood, (2) it uses several
                 granularities and approximates likelihood values in
                 search processing, and (3) it focuses on just the
                 promising likelihood computations by pruning out
                 low-likelihood state sequences. Experiments verify the
                 effectiveness of SPIRAL and show that it is more than
                 490 times faster than the naive method.",
  acknowledgement = ack-nhfb,
  articleno =    "18",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1054",
  keywords =     "Hidden Markov model; likelihood; upper bound",
}
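
%%% SPIRAL's pruning idea (3) can be illustrated with a beam-pruned
%%% Viterbi search: partial state sequences whose log-likelihood
%%% falls far below the running best are dropped.  This sketch shows
%%% only the pruning pattern, not SPIRAL's clustered states or
%%% multiple granularities.
%%%
%%%   import numpy as np
%%%
%%%   def viterbi_pruned(log_pi, log_A, log_B, obs, beam=20.0):
%%%       """log_pi: initial, log_A: transition, log_B: emission."""
%%%       score = log_pi + log_B[:, obs[0]]
%%%       for o in obs[1:]:
%%%           score = np.max(score[:, None] + log_A, 0) + log_B[:, o]
%%%           score[score < score.max() - beam] = -np.inf   # prune
%%%       return score.max()  # best state-sequence log-likelihood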

@Article{Zhang:2009:EAG,
  author =       "Xiang Zhang and Fei Zou and Wei Wang",
  title =        "Efficient algorithms for genome-wide association
                 study",
  journal =      j-TKDD,
  volume =       "3",
  number =       "4",
  pages =        "19:1--19:??",
  month =        nov,
  year =         "2009",
  CODEN =        "????",
  DOI =          "http://doi.acm.org/10.1145/1631162.1631167",
  ISSN =         "1556-4681 (print), 1556-472X (electronic)",
  bibdate =      "Tue Mar 16 18:37:13 MDT 2010",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tkdd/;
                 http://www.math.utah.edu/pub/tex/bib/tkdd.bib",
  abstract =     "Studying the association between quantitative
                 phenotype (such as height or weight) and single
                 nucleotide polymorphisms (SNPs) is an important problem
                 in biology. To understand underlying mechanisms of
                 complex phenotypes, it is often necessary to consider
                  joint genetic effects across multiple SNPs. The ANOVA
                  (analysis of variance) test is routinely used in
                  association studies. Important findings from studying
                 gene-gene (SNP-pair) interactions are appearing in the
                 literature. However, the number of SNPs can be up to
                 millions. Evaluating joint effects of SNPs is a
                  challenging task even for SNP-pairs. Moreover, with a
                  large number of correlated SNPs, a permutation procedure
                  is preferred over simple Bonferroni correction for
                 properly controlling family-wise error rate and
                 retaining mapping power, which dramatically increases
                 the computational cost of association study.\par

                 In this article, we study the problem of finding
                 SNP-pairs that have significant associations with a
                 given quantitative phenotype. We propose an efficient
                 algorithm, FastANOVA, for performing ANOVA tests on
                 SNP-pairs in a batch mode, which also supports large
                 permutation test. We derive an upper bound of SNP-pair
                 ANOVA test, which can be expressed as the sum of two
                 terms. The first term is based on single-SNP ANOVA
                 test. The second term is based on the SNPs and
                 independent of any phenotype permutation. Furthermore,
                 SNP-pairs can be organized into groups, each of which
                 shares a common upper bound. This allows for maximum
                 reuse of intermediate computation, efficient upper
                 bound estimation, and effective SNP-pair pruning.
                 Consequently, FastANOVA only needs to perform the ANOVA
                 test on a small number of candidate SNP-pairs without
                 the risk of missing any significant ones. Extensive
                 experiments demonstrate that FastANOVA is orders of
                 magnitude faster than the brute-force implementation of
                 ANOVA tests on all SNP pairs. The principles used in
                 FastANOVA can be applied to categorical phenotypes and
                  other statistics such as the Chi-square test.",
  acknowledgement = ack-nhfb,
  articleno =    "19",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1054",
  keywords =     "ANOVA test; Association study; permutation test",
}
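
%%% FastANOVA's filter-and-refine pattern: screen SNP-pairs with the
%%% cheap upper bound (one term from the single-SNP test, one term
%%% independent of the phenotype permutation) and run the exact
%%% ANOVA test only on survivors.  f_exact and f_bound stand in for
%%% the statistics derived in the article.
%%%
%%%   def significant_pairs(snps, f_exact, f_bound, threshold):
%%%       hits = []
%%%       for i in range(len(snps)):
%%%           for j in range(i + 1, len(snps)):
%%%               if f_bound(snps[i], snps[j]) < threshold:
%%%                   continue         # safely pruned, no test run
%%%               f = f_exact(snps[i], snps[j])
%%%               if f >= threshold:
%%%                   hits.append((i, j, f))
%%%       return hits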

@Article{Bilgic:2009:RCM,
  author =       "Mustafa Bilgic and Lise Getoor",
  title =        "Reflect and correct: {A} misclassification prediction
                 approach to active inference",
  journal =      j-TKDD,
  volume =       "3",
  number =       "4",
  pages =        "20:1--20:??",
  month =        nov,
  year =         "2009",
  CODEN =        "????",
  DOI =          "http://doi.acm.org/10.1145/1631162.1631168",
  ISSN =         "1556-4681 (print), 1556-472X (electronic)",
  bibdate =      "Tue Mar 16 18:37:13 MDT 2010",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tkdd/;
                 http://www.math.utah.edu/pub/tex/bib/tkdd.bib",
  abstract =     "Information diffusion, viral marketing, graph-based
                 semi-supervised learning, and collective classification
                 all attempt to model and exploit the relationships
                 among nodes in a network to improve the performance of
                 node labeling algorithms. However, sometimes the
                 advantage of exploiting the relationships can become a
                 disadvantage. Simple models like label propagation and
                 iterative classification can aggravate a
                 misclassification by propagating mistakes in the
                 network, while more complex models that define and
                 optimize a global objective function, such as Markov
                 random fields and graph mincuts, can misclassify a set
                 of nodes jointly. This problem can be mitigated if the
                 classification system is allowed to ask for the correct
                 labels for a few of the nodes during inference.
                 However, determining the optimal set of labels to
                 acquire is intractable under relatively general
                 assumptions, which forces us to resort to approximate
                 and heuristic techniques. We describe three such
                 techniques in this article. The first one is based on
                 directly approximating the value of the objective
                 function of label acquisition and greedily acquiring
                 the label that provides the most improvement. The
                 second technique is a simple technique based on the
                 analogy we draw between viral marketing and label
                 acquisition. Finally, we propose a method, which we
                 refer to as {\em reflect and correct}, that can learn
                 and predict when the classification system is likely to
                 make mistakes and suggests acquisitions to correct
                 those mistakes. We empirically show on a variety of
                 synthetic and real-world datasets that the reflect and
                 correct method significantly outperforms the other two
                 techniques, as well as other approaches based on
                 network structural measures such as node degree and
                 network clustering.",
  acknowledgement = ack-nhfb,
  articleno =    "20",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1054",
  keywords =     "Active inference; collective classification;
                 information diffusion; label acquisition; viral
                 marketing",
}
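
%%% The acquisition loop implied by the abstract: a learned mistake
%%% predictor ranks nodes by how likely the collective classifier is
%%% to get them wrong, and the label budget is spent there before
%%% re-running inference.  classify, mistake_score, and oracle are
%%% assumed callables, not the paper's components.
%%%
%%%   def reflect_and_correct(nodes, classify, mistake_score,
%%%                           oracle, budget):
%%%       acquired = {}
%%%       for _ in range(budget):
%%%           preds = classify(nodes, acquired)
%%%           target = max((v for v in nodes if v not in acquired),
%%%                        key=lambda v: mistake_score(v, preds))
%%%           acquired[target] = oracle(target)  # ask for the label
%%%       return classify(nodes, acquired)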

@Article{Kiernan:2009:CCS,
  author =       "Jerry Kiernan and Evimaria Terzi",
  title =        "Constructing comprehensive summaries of large event
                 sequences",
  journal =      j-TKDD,
  volume =       "3",
  number =       "4",
  pages =        "21:1--21:??",
  month =        nov,
  year =         "2009",
  CODEN =        "????",
  DOI =          "http://doi.acm.org/10.1145/1631162.1631169",
  ISSN =         "1556-4681 (print), 1556-472X (electronic)",
  bibdate =      "Tue Mar 16 18:37:13 MDT 2010",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tkdd/;
                 http://www.math.utah.edu/pub/tex/bib/tkdd.bib",
  abstract =     "Event sequences capture system and user activity over
                 time. Prior research on sequence mining has mostly
                 focused on discovering local patterns appearing in a
                 sequence. While interesting, these patterns do not give
                 a comprehensive summary of the entire event sequence.
                 Moreover, the number of patterns discovered can be
                 large. In this article, we take an alternative approach
                 and build {\em short\/} summaries that describe an
                 entire sequence, and discover local dependencies
                 between event types.\par

                 We formally define the summarization problem as an
                 optimization problem that balances shortness of the
                 summary with accuracy of the data description. We show
                 that this problem can be solved optimally in polynomial
                 time by using a combination of two dynamic-programming
                 algorithms. We also explore more efficient greedy
                 alternatives and demonstrate that they work well on
                 large datasets. Experiments on both synthetic and real
                 datasets illustrate that our algorithms are efficient
                 and produce high-quality results, and reveal
                 interesting local structures in the data.",
  acknowledgement = ack-nhfb,
  articleno =    "21",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1054",
  keywords =     "Event sequences; log mining; summarization",
}
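
%%% The shortness-versus-accuracy trade-off above is solvable
%%% exactly by dynamic programming over segment boundaries.  In this
%%% sketch, seg_cost(i, j) is an assumed local description cost for
%%% events[i:j] and `penalty` prices each additional segment:
%%%
%%%   def summarize(n, seg_cost, penalty):
%%%       best = [0.0] + [float("inf")] * n  # best[j]: events[:j]
%%%       cut = [0] * (n + 1)
%%%       for j in range(1, n + 1):
%%%           for i in range(j):
%%%               c = best[i] + seg_cost(i, j) + penalty
%%%               if c < best[j]:
%%%                   best[j], cut[j] = c, i
%%%       bounds, j = [], n            # recover segment boundaries
%%%       while j > 0:
%%%           bounds.append((cut[j], j))
%%%           j = cut[j]
%%%       return best[n], bounds[::-1]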

@Article{Koren:2010:FNS,
  author =       "Yehuda Koren",
  title =        "Factor in the neighbors: {Scalable} and accurate
                 collaborative filtering",
  journal =      j-TKDD,
  volume =       "4",
  number =       "1",
  pages =        "1:1--1:??",
  month =        jan,
  year =         "2010",
  CODEN =        "????",
  DOI =          "http://doi.acm.org/10.1145/1644873.1644874",
  ISSN =         "1556-4681 (print), 1556-472X (electronic)",
  bibdate =      "Tue Mar 16 18:37:37 MDT 2010",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tkdd/;
                 http://www.math.utah.edu/pub/tex/bib/tkdd.bib",
  abstract =     "Recommender systems provide users with personalized
                 suggestions for products or services. These systems
                  often rely on collaborative filtering (CF), where past
                 transactions are analyzed in order to establish
                 connections between users and products. The most common
                 approach to CF is based on neighborhood models, which
                 originate from similarities between products or users.
                 In this work we introduce a new neighborhood model with
                 an improved prediction accuracy. Unlike previous
                 approaches that are based on heuristic similarities, we
                 model neighborhood relations by minimizing a global
                 cost function. Further accuracy improvements are
                 achieved by extending the model to exploit both
                 explicit and implicit feedback by the users. Past
                  models were limited by the need to compute all pairwise
                  similarities between items or users, whose number grows
                  quadratically with input size. In particular, this
                 limitation vastly complicates adopting user similarity
                 models, due to the typical large number of users. Our
                 new model solves these limitations by factoring the
                 neighborhood model, thus making both item-item and
                 user-user implementations scale linearly with the size
                 of the data. The methods are tested on the Netflix
                 data, with encouraging results.",
  acknowledgement = ack-nhfb,
  articleno =    "1",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1054",
  keywords =     "collaborative filtering; Netflix Prize; Recommender
                 systems",
}
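
%%% The factored neighborhood model replaces pairwise item-item
%%% weights with inner products of learned item factors, which is
%%% what makes both implementations scale linearly.  Simplified
%%% prediction rule (bias handling and the implicit-feedback factors
%%% of the full model are collapsed into the assumed arrays):
%%%
%%%   import numpy as np
%%%
%%%   def predict(mu, b_u, b_i, q, x, rated, r, u, i):
%%%       """rated: items rated by user u; r[j]: u's rating of j."""
%%%       dev = np.zeros(q.shape[1])
%%%       for j in rated:
%%%           dev += (r[j] - mu - b_u[u] - b_i[j]) * x[j]
%%%       if rated:
%%%           dev /= np.sqrt(len(rated))
%%%       return mu + b_u[u] + b_i[i] + q[i] @ dev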

@Article{Syed:2010:MDP,
  author =       "Zeeshan Syed and Collin Stultz and Manolis Kellis and
                 Piotr Indyk and John Guttag",
  title =        "Motif discovery in physiological datasets: {A}
                 methodology for inferring predictive elements",
  journal =      j-TKDD,
  volume =       "4",
  number =       "1",
  pages =        "2:1--2:??",
  month =        jan,
  year =         "2010",
  CODEN =        "????",
  DOI =          "http://doi.acm.org/10.1145/1644873.1644875",
  ISSN =         "1556-4681 (print), 1556-472X (electronic)",
  bibdate =      "Tue Mar 16 18:37:37 MDT 2010",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tkdd/;
                 http://www.math.utah.edu/pub/tex/bib/tkdd.bib",
  abstract =     "In this article, we propose a methodology for
                 identifying predictive physiological patterns in the
                 absence of prior knowledge. We use the principle of
                 conservation to identify activity that consistently
                 precedes an outcome in patients, and describe a
                 two-stage process that allows us to efficiently search
                 for such patterns in large datasets. This involves
                 first transforming continuous physiological signals
                 from patients into symbolic sequences, and then
                 searching for patterns in these reduced representations
                 that are strongly associated with an outcome.\par

                 Our strategy of identifying conserved activity that is
                 unlikely to have occurred purely by chance in symbolic
                 data is analogous to the discovery of regulatory motifs
                 in genomic datasets. We build upon existing work in
                 this area, generalizing the notion of a regulatory
                 motif and enhancing current techniques to operate
                 robustly on non-genomic data. We also address two
                 significant considerations associated with motif
                 discovery in general: computational efficiency and
                 robustness in the presence of degeneracy and noise. To
                 deal with these issues, we introduce the concept of
                 active regions and new subset-based techniques such as
                 a two-layer Gibbs sampling algorithm. These extensions
                 allow for a framework for information inference, where
                 precursors are identified as approximately conserved
                 activity of arbitrary complexity preceding multiple
                 occurrences of an event.\par

                 We evaluated our solution on a population of patients
                 who experienced sudden cardiac death and attempted to
                 discover electrocardiographic activity that may be
                 associated with the endpoint of death. To assess the
                 predictive patterns discovered, we compared likelihood
                 scores for motifs in the sudden death population
                 against control populations of normal individuals and
                 those with non-fatal supraventricular arrhythmias. Our
                 results suggest that predictive motif discovery may be
                 able to identify clinically relevant information even
                 in the absence of significant prior knowledge.",
  acknowledgement = ack-nhfb,
  articleno =    "2",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1054",
  keywords =     "data mining; Gibbs sampling; inference; knowledge
                 discovery; motifs; physiological signals",
}
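
%%% Stage one of the methodology above: turn a continuous
%%% physiological signal into a symbolic sequence so motif-discovery
%%% techniques apply.  A SAX-style discretization is sketched here;
%%% the paper's actual transformation details are not reproduced.
%%%
%%%   import numpy as np
%%%   from scipy.stats import norm
%%%
%%%   def symbolize(signal, word_len=64, alphabet=4):
%%%       x = (signal - signal.mean()) / (signal.std() + 1e-12)
%%%       paa = np.array([s.mean()                # piecewise means
%%%                       for s in np.array_split(x, word_len)])
%%%       # equiprobable breakpoints under a standard normal
%%%       cuts = norm.ppf(np.linspace(0, 1, alphabet + 1)[1:-1])
%%%       return "".join(chr(ord("a") + int(np.searchsorted(cuts, v)))
%%%                      for v in paa)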

@Article{Webb:2010:SSI,
  author =       "Geoffrey I. Webb",
  title =        "Self-sufficient itemsets: {An} approach to screening
                 potentially interesting associations between items",
  journal =      j-TKDD,
  volume =       "4",
  number =       "1",
  pages =        "3:1--3:??",
  month =        jan,
  year =         "2010",
  CODEN =        "????",
  DOI =          "http://doi.acm.org/10.1145/1644873.1644876",
  ISSN =         "1556-4681 (print), 1556-472X (electronic)",
  bibdate =      "Tue Mar 16 18:37:37 MDT 2010",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tkdd/;
                 http://www.math.utah.edu/pub/tex/bib/tkdd.bib",
  abstract =     "Self-sufficient itemsets are those whose frequency
                 cannot be explained solely by the frequency of either
                 their subsets or of their supersets. We argue that
                 itemsets that are not self-sufficient will often be of
                 little interest to the data analyst, as their frequency
                 should be expected once that of the itemsets on which
                 their frequency depends is known. We present tests for
                 statistically sound discovery of self-sufficient
                 itemsets, and computational techniques that allow those
                 tests to be applied as a post-processing step for any
                 itemset discovery algorithm. We also present a measure
                 for assessing the degree of potential interest in an
                 itemset that complements these statistical measures.",
  acknowledgement = ack-nhfb,
  articleno =    "3",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1054",
  keywords =     "Association discovery; association rules; itemset
                 discovery; itemset screening; statistical evaluation",
}
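
%%% One flavor of the statistical screening above: test whether
%%% {A, B} occurs more often than its partition into {A} and {B}
%%% predicts under independence.  Full self-sufficiency also tests
%%% against supersets; this sketch checks a single binary partition
%%% with Fisher's exact test.
%%%
%%%   from scipy.stats import fisher_exact
%%%
%%%   def productive(n, n_a, n_b, n_ab, alpha=0.05):
%%%       """n: transactions; n_a, n_b, n_ab: support counts."""
%%%       table = [[n_ab, n_a - n_ab],
%%%                [n_b - n_ab, n - n_a - n_b + n_ab]]
%%%       _, p = fisher_exact(table, alternative="greater")
%%%       return p < alpha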

@Article{Plantevit:2010:MMM,
  author =       "Marc Plantevit and Anne Laurent and Dominique Laurent
                 and Maguelonne Teisseire and Yeow Wei Choong",
  title =        "Mining multidimensional and multilevel sequential
                 patterns",
  journal =      j-TKDD,
  volume =       "4",
  number =       "1",
  pages =        "4:1--4:??",
  month =        jan,
  year =         "2010",
  CODEN =        "????",
  DOI =          "http://doi.acm.org/10.1145/1644873.1644877",
  ISSN =         "1556-4681 (print), 1556-472X (electronic)",
  bibdate =      "Tue Mar 16 18:37:37 MDT 2010",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tkdd/;
                 http://www.math.utah.edu/pub/tex/bib/tkdd.bib",
  abstract =     "Multidimensional databases have been designed to
                 provide decision makers with the necessary tools to
                 help them understand their data. This framework is
                 different from transactional data as the datasets
                 contain huge volumes of historicized and aggregated
                 data defined over a set of dimensions that can be
                 arranged through multiple levels of granularities. Many
                 tools have been proposed to query the data and navigate
                 through the levels of granularity. However, automatic
                 tools are still missing to mine this type of data in
                 order to discover regular specific patterns. In this
                 article, we present a method for mining sequential
                 patterns from multidimensional databases, at the same
                 time taking advantage of the different dimensions and
                 levels of granularity, which is original compared to
                 existing work. The necessary definitions and algorithms
                 are extended from regular sequential patterns to this
                 particular case. Experiments are reported, showing the
                 significance of this approach.",
  acknowledgement = ack-nhfb,
  articleno =    "4",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1054",
  keywords =     "frequent patterns; hierarchy; multidimensional
                 databases; multilevel patterns; Sequential patterns",
}

@Article{Zaki:2010:VVO,
  author =       "Mohammed J. Zaki and Christopher D. Carothers and
                 Boleslaw K. Szymanski",
  title =        "{VOGUE}: {A} variable order hidden {Markov} model with
                 duration based on frequent sequence mining",
  journal =      j-TKDD,
  volume =       "4",
  number =       "1",
  pages =        "5:1--5:??",
  month =        jan,
  year =         "2010",
  CODEN =        "????",
  DOI =          "http://doi.acm.org/10.1145/1644873.1644878",
  ISSN =         "1556-4681 (print), 1556-472X (electronic)",
  bibdate =      "Tue Mar 16 18:37:37 MDT 2010",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tkdd/;
                 http://www.math.utah.edu/pub/tex/bib/tkdd.bib",
  abstract =     "We present VOGUE, a novel, variable order hidden
                 Markov model with state durations, that combines two
                 separate techniques for modeling complex patterns in
                 sequential data: pattern mining and data modeling.
                 VOGUE relies on a variable gap sequence mining method
                 to extract frequent patterns with different lengths and
                 gaps between elements. It then uses these mined
                 sequences to build a variable order hidden Markov model
                 (HMM), that explicitly models the gaps. The gaps
                 implicitly model the order of the HMM, and they
                 explicitly model the duration of each state. We apply
                 VOGUE to a variety of real sequence data taken from
                 domains such as protein sequence classification, Web
                 usage logs, intrusion detection, and spelling
                 correction. We show that VOGUE has superior
                 classification accuracy compared to regular HMMs,
                 higher-order HMMs, and even special purpose HMMs like
                 HMMER, which is a state-of-the-art method for protein
                 classification. The VOGUE implementation and the
                 datasets used in this article are available as
                  open-source.",
  acknowledgement = ack-nhfb,
  articleno =    "5",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1054",
  keywords =     "Hidden Markov models; higher-order HMM; HMM with
                 duration; sequence mining and modeling; variable-order
                 HMM",
}
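
%%% The first half of VOGUE's pipeline: mine symbol pairs that occur
%%% within a bounded, variable gap, recording observed gap lengths
%%% (which the model then uses for state durations).  A minimal
%%% counting sketch, not the paper's miner:
%%%
%%%   from collections import defaultdict
%%%
%%%   def gapped_pairs(sequences, max_gap=5, min_sup=2):
%%%       support, gaps = defaultdict(int), defaultdict(list)
%%%       for seq in sequences:
%%%           seen = set()
%%%           for i, a in enumerate(seq):
%%%               for g in range(1, max_gap + 1):
%%%                   if i + g >= len(seq):
%%%                       break
%%%                   pair = (a, seq[i + g])
%%%                   gaps[pair].append(g - 1)  # gap between them
%%%                   if pair not in seen:      # support counted
%%%                       support[pair] += 1    # once per sequence
%%%                       seen.add(pair)
%%%       return {p: gaps[p] for p, s in support.items()
%%%               if s >= min_sup}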

@Article{Vadera:2010:CCS,
  author =       "Sunil Vadera",
  title =        "{CSNL}: {A} cost-sensitive non-linear decision tree
                 algorithm",
  journal =      j-TKDD,
  volume =       "4",
  number =       "2",
  pages =        "6:1--6:??",
  month =        may,
  year =         "2010",
  CODEN =        "????",
  DOI =          "http://doi.acm.org/10.1145/1754428.1754429",
  ISSN =         "1556-4681 (print), 1556-472X (electronic)",
  bibdate =      "Sat Aug 14 17:12:30 MDT 2010",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tkdd/;
                 http://www.math.utah.edu/pub/tex/bib/tkdd.bib",
  abstract =     "This article presents a new decision tree learning
                 algorithm called CSNL that induces Cost-Sensitive
                 Non-Linear decision trees. The algorithm is based on
                 the hypothesis that nonlinear decision nodes provide a
                 better basis than axis-parallel decision nodes and
                 utilizes discriminant analysis to construct nonlinear
                 decision trees that take account of costs of
                 misclassification.\par

                 The performance of the algorithm is evaluated by
                 applying it to seventeen datasets and the results are
                 compared with those obtained by two well known
                 cost-sensitive algorithms, ICET and MetaCost, which
                 generate multiple trees to obtain some of the best
                  results to date. The results show that CSNL performs at
                  least as well as, if not better than, these algorithms on
                  more than twelve of the datasets and is considerably
                 faster. The use of bagging with CSNL further enhances
                 its performance showing the significant benefits of
                 using nonlinear decision nodes.",
  acknowledgement = ack-nhfb,
  articleno =    "6",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1054",
  keywords =     "cost-sensitive learning; Decision tree learning",
}
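
%%% The cost-sensitive ingredient of CSNL-style trees, in its
%%% simplest form: label a leaf with the class that minimizes
%%% expected misclassification cost rather than the majority class.
%%% cost[i][j] is the assumed cost of predicting class i when the
%%% truth is class j.
%%%
%%%   def leaf_label(class_counts, cost):
%%%       total = sum(class_counts)
%%%       def expected(pred):
%%%           return sum(cost[pred][c] * n
%%%                      for c, n in enumerate(class_counts)) / total
%%%       return min(range(len(class_counts)), key=expected)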

@Article{Kandylas:2010:AKC,
  author =       "Vasileios Kandylas and S. Phineas Upham and Lyle H.
                 Ungar",
  title =        "Analyzing knowledge communities using foreground and
                 background clusters",
  journal =      j-TKDD,
  volume =       "4",
  number =       "2",
  pages =        "7:1--7:??",
  month =        may,
  year =         "2010",
  CODEN =        "????",
  DOI =          "http://doi.acm.org/10.1145/1754428.1754430",
  ISSN =         "1556-4681 (print), 1556-472X (electronic)",
  bibdate =      "Sat Aug 14 17:12:30 MDT 2010",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tkdd/;
                 http://www.math.utah.edu/pub/tex/bib/tkdd.bib",
  abstract =     "Insight into the growth (or shrinkage) of ``knowledge
                 communities'' of authors that build on each other's
                 work can be gained by studying the evolution over time
                 of clusters of documents. We cluster documents based on
                 the documents they cite in common using the Streemer
                 clustering method, which finds cohesive foreground
                 clusters (the knowledge communities) embedded in a
                 diffuse background. We build predictive models with
                 features based on the citation structure, the
                 vocabulary of the papers, and the affiliations and
                 prestige of the authors and use these models to study
                 the drivers of community growth and the predictors of
                 how widely a paper will be cited. We find that
                 scientific knowledge communities tend to grow more
                 rapidly if their publications build on diverse
                 information and use narrow vocabulary and that papers
                 that lie on the periphery of a community have the
                 highest impact, while those not in any community have
                 the lowest impact.",
  acknowledgement = ack-nhfb,
  articleno =    "7",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1054",
  keywords =     "citation analysis; clustering; community evolution;
                 knowledge communities; Text mining",
}

@Article{Ji:2010:SSL,
  author =       "Shuiwang Ji and Lei Tang and Shipeng Yu and Jieping
                 Ye",
  title =        "A shared-subspace learning framework for multi-label
                 classification",
  journal =      j-TKDD,
  volume =       "4",
  number =       "2",
  pages =        "8:1--8:??",
  month =        may,
  year =         "2010",
  CODEN =        "????",
  DOI =          "http://doi.acm.org/10.1145/1754428.1754431",
  ISSN =         "1556-4681 (print), 1556-472X (electronic)",
  bibdate =      "Sat Aug 14 17:12:30 MDT 2010",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tkdd/;
                 http://www.math.utah.edu/pub/tex/bib/tkdd.bib",
  abstract =     "Multi-label problems arise in various domains such as
                 multi-topic document categorization, protein function
                 prediction, and automatic image annotation. One natural
                 way to deal with such problems is to construct a binary
                 classifier for each label, resulting in a set of
                 independent binary classification problems. Since
                 multiple labels share the same input space, and the
                 semantics conveyed by different labels are usually
                 correlated, it is essential to exploit the correlation
                 information contained in different labels. In this
                 paper, we consider a general framework for extracting
                 shared structures in multi-label classification. In
                 this framework, a common subspace is assumed to be
                 shared among multiple labels. We show that the optimal
                 solution to the proposed formulation can be obtained by
                 solving a generalized eigenvalue problem, though the
                 problem is nonconvex. For high-dimensional problems,
                 direct computation of the solution is expensive, and we
                 develop an efficient algorithm for this case. One
                 appealing feature of the proposed framework is that it
                 includes several well-known algorithms as special
                 cases, thus elucidating their intrinsic relationships.
                 We further show that the proposed framework can be
                 extended to the kernel-induced feature space. We have
                 conducted extensive experiments on multi-topic web page
                 categorization and automatic gene expression pattern
                 image annotation tasks, and results demonstrate the
                 effectiveness of the proposed formulation in comparison
                 with several representative algorithms.",
  acknowledgement = ack-nhfb,
  articleno =    "8",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1054",
  keywords =     "gene expression pattern image annotation; kernel
                 methods; least squares loss; Multi-label
                 classification; shared subspace; singular value
                 decomposition; web page categorization",
}
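
%%% The abstract above reduces the shared-subspace formulation to a
%%% generalized eigenvalue problem.  A hedged sketch of that final step
%%% with SciPy on synthetic matrices (the matrices A and B below merely
%%% stand in for the ones the article derives from the multi-label
%%% data; the exact construction is in the paper):
%%%
%%%     import numpy as np
%%%     from scipy.linalg import eigh
%%%
%%%     rng = np.random.default_rng(0)
%%%     d = 20
%%%     X = rng.standard_normal((100, d))
%%%
%%%     A = X.T @ X / 100              # symmetric PSD stand-in
%%%     B = A + 0.1 * np.eye(d)        # regularized, positive definite
%%%
%%%     # Solve A v = lambda B v and keep the leading eigenvectors as
%%%     # the basis of the shared subspace.
%%%     vals, vecs = eigh(A, B)        # eigenvalues in ascending order
%%%     subspace = vecs[:, -5:]        # top-5 generalized eigenvectors
%%%     print(subspace.shape)          # (20, 5)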

@Article{Ruggieri:2010:DMD,
  author =       "Salvatore Ruggieri and Dino Pedreschi and Franco
                 Turini",
  title =        "Data mining for discrimination discovery",
  journal =      j-TKDD,
  volume =       "4",
  number =       "2",
  pages =        "9:1--9:??",
  month =        may,
  year =         "2010",
  CODEN =        "????",
  DOI =          "http://doi.acm.org/10.1145/1754428.1754432",
  ISSN =         "1556-4681 (print), 1556-472X (electronic)",
  bibdate =      "Sat Aug 14 17:12:30 MDT 2010",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tkdd/;
                 http://www.math.utah.edu/pub/tex/bib/tkdd.bib",
  abstract =     "In the context of civil rights law, discrimination
                 refers to unfair or unequal treatment of people based
                  on membership in a category or a minority, without
                 regard to individual merit. Discrimination in credit,
                 mortgage, insurance, labor market, and education has
                 been investigated by researchers in economics and human
                 sciences. With the advent of automatic decision support
                 systems, such as credit scoring systems, the ease of
                 data collection opens several challenges to data
                 analysts for the fight against discrimination. In this
                 article, we introduce the problem of discovering
                 discrimination through data mining in a dataset of
                 historical decision records, taken by humans or by
                 automatic systems. We formalize the processes of direct
                 and indirect discrimination discovery by modelling
                 protected-by-law groups and contexts where
                  discrimination occurs in a classification-rule-based
                 syntax. Basically, classification rules extracted from
                 the dataset allow for unveiling contexts of unlawful
                 discrimination, where the degree of burden over
                 protected-by-law groups is formalized by an extension
                 of the lift measure of a classification rule. In direct
                 discrimination, the extracted rules can be directly
                 mined in search of discriminatory contexts. In indirect
                 discrimination, the mining process needs some
                 background knowledge as a further input, for example,
                 census data, that combined with the extracted rules
                 might allow for unveiling contexts of discriminatory
                 decisions. A strategy adopted for combining extracted
                 classification rules with background knowledge is
                 called an inference model. In this article, we propose
                 two inference models and provide automatic procedures
                 for their implementation. An empirical assessment of
                 our results is provided on the German credit dataset
                 and on the PKDD Discovery Challenge 1999 financial
                 dataset.",
  acknowledgement = ack-nhfb,
  articleno =    "9",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1054",
  keywords =     "classification rules; Discrimination",
}
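
%%% The Ruggieri--Pedreschi--Turini abstract formalizes the burden on
%%% protected groups via an extension of a rule's lift.  A toy sketch
%%% of one such measure: the confidence of a rule with the protected
%%% condition divided by the confidence of the context-only rule (a
%%% simplified reading; the article's exact definitions and inference
%%% models are richer):
%%%
%%%     def confidence(records, antecedent, consequent):
%%%         """conf(antecedent -> consequent) over dict records."""
%%%         covered = [r for r in records
%%%                    if all(r.get(k) == v for k, v in antecedent.items())]
%%%         if not covered:
%%%             return 0.0
%%%         hits = [r for r in covered
%%%                 if all(r.get(k) == v for k, v in consequent.items())]
%%%         return len(hits) / len(covered)
%%%
%%%     def extended_lift(records, protected, context, consequent):
%%%         full = confidence(records, {**protected, **context}, consequent)
%%%         base = confidence(records, context, consequent)
%%%         return full / base if base else float("inf")
%%%
%%%     toy = [  # hypothetical decision records
%%%         {"sex": "f", "city": "A", "deny": True},
%%%         {"sex": "f", "city": "A", "deny": True},
%%%         {"sex": "m", "city": "A", "deny": False},
%%%         {"sex": "m", "city": "A", "deny": True},
%%%     ]
%%%     print(extended_lift(toy, {"sex": "f"}, {"city": "A"},
%%%                         {"deny": True}))   # 1.33...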

@Article{Thomas:2010:MMF,
  author =       "Lini T. Thomas and Satyanarayana R. Valluri and
                 Kamalakar Karlapalem",
  title =        "{MARGIN}: {Maximal} frequent subgraph mining",
  journal =      j-TKDD,
  volume =       "4",
  number =       "3",
  pages =        "10:1--10:??",
  month =        oct,
  year =         "2010",
  CODEN =        "????",
  DOI =          "http://dx.doi.org/10.1145/1839490.1839491",
  ISSN =         "1556-4681 (print), 1556-472X (electronic)",
  bibdate =      "Mon Mar 28 11:43:57 MDT 2011",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tkdd/;
                 http://www.math.utah.edu/pub/tex/bib/tkdd.bib",
  acknowledgement = ack-nhfb,
  articleno =    "10",
  fjournal =     "ACM Transactions on Knowledge Discovery from Data
                 (TKDD)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1054",
}

@Article{Deodhar:2010:SFS,
  author =       "Meghana Deodhar and Joydeep Ghosh",
  title =        "{SCOAL}: {A} framework for simultaneous co-clustering
                 and learning from complex data",
  journal =      j-TKDD,
  volume =       "4",
  number =       "3",
  pages =        "11:1--11:??",
  month =        oct,
  year =         "2010",
  CODEN =        "????",
  DOI =          "http://dx.doi.org/10.1145/1839490.1839492",
  ISSN =         "1556-4681 (print), 1556-472X (electronic)",
  bibdate =      "Mon Mar 28 11:43:57 MDT 2011",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tkdd/;
                 http://www.math.utah.edu/pub/tex/bib/tkdd.bib",
  acknowledgement = ack-nhfb,
  articleno =    "11",
  fjournal =     "ACM Transactions on Knowledge Discovery from Data
                 (TKDD)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1054",
}

@Article{Chen:2010:BBI,
  author =       "Jinlin Chen and Keli Xiao",
  title =        "{BISC}: {A} bitmap itemset support counting approach
                 for efficient frequent itemset mining",
  journal =      j-TKDD,
  volume =       "4",
  number =       "3",
  pages =        "12:1--12:??",
  month =        oct,
  year =         "2010",
  CODEN =        "????",
  DOI =          "http://dx.doi.org/10.1145/1839490.1839493",
  ISSN =         "1556-4681 (print), 1556-472X (electronic)",
  bibdate =      "Mon Mar 28 11:43:57 MDT 2011",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tkdd/;
                 http://www.math.utah.edu/pub/tex/bib/tkdd.bib",
  acknowledgement = ack-nhfb,
  articleno =    "12",
  fjournal =     "ACM Transactions on Knowledge Discovery from Data
                 (TKDD)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1054",
}

@Article{Becchetti:2010:EAL,
  author =       "Luca Becchetti and Paolo Boldi and Carlos Castillo and
                 Aristides Gionis",
  title =        "Efficient algorithms for large-scale local triangle
                 counting",
  journal =      j-TKDD,
  volume =       "4",
  number =       "3",
  pages =        "13:1--13:??",
  month =        oct,
  year =         "2010",
  CODEN =        "????",
  DOI =          "http://dx.doi.org/10.1145/1839490.1839494",
  ISSN =         "1556-4681 (print), 1556-472X (electronic)",
  bibdate =      "Mon Mar 28 11:43:57 MDT 2011",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tkdd/;
                 http://www.math.utah.edu/pub/tex/bib/tkdd.bib",
  acknowledgement = ack-nhfb,
  articleno =    "13",
  fjournal =     "ACM Transactions on Knowledge Discovery from Data
                 (TKDD)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1054",
}

@Article{Zhang:2010:MDR,
  author =       "Yin Zhang and Zhi-Hua Zhou",
  title =        "Multilabel dimensionality reduction via dependence
                 maximization",
  journal =      j-TKDD,
  volume =       "4",
  number =       "3",
  pages =        "14:1--14:??",
  month =        oct,
  year =         "2010",
  CODEN =        "????",
  DOI =          "http://dx.doi.org/10.1145/1839490.1839495",
  ISSN =         "1556-4681 (print), 1556-472X (electronic)",
  bibdate =      "Mon Mar 28 11:43:57 MDT 2011",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tkdd/;
                 http://www.math.utah.edu/pub/tex/bib/tkdd.bib",
  acknowledgement = ack-nhfb,
  articleno =    "14",
  fjournal =     "ACM Transactions on Knowledge Discovery from Data
                 (TKDD)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1054",
}

@Article{Cui:2010:LMN,
  author =       "Ying Cui and Xiaoli Z. Fern and Jennifer G. Dy",
  title =        "Learning multiple nonredundant clusterings",
  journal =      j-TKDD,
  volume =       "4",
  number =       "3",
  pages =        "15:1--15:??",
  month =        oct,
  year =         "2010",
  CODEN =        "????",
  DOI =          "http://dx.doi.org/10.1145/1839490.1839496",
  ISSN =         "1556-4681 (print), 1556-472X (electronic)",
  bibdate =      "Mon Mar 28 11:43:57 MDT 2011",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tkdd/;
                 http://www.math.utah.edu/pub/tex/bib/tkdd.bib",
  acknowledgement = ack-nhfb,
  articleno =    "15",
  fjournal =     "ACM Transactions on Knowledge Discovery from Data
                 (TKDD)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1054",
}

@Article{Wang:2010:TSI,
  author =       "Wei Wang",
  title =        "{TKDD} Special Issue: {SIGKDD 2009}",
  journal =      j-TKDD,
  volume =       "4",
  number =       "4",
  pages =        "16:1--16:??",
  month =        oct,
  year =         "2010",
  CODEN =        "????",
  DOI =          "http://dx.doi.org/10.1145/1857947.1857948",
  ISSN =         "1556-4681 (print), 1556-472X (electronic)",
  bibdate =      "Mon Mar 28 11:43:58 MDT 2011",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tkdd/;
                 http://www.math.utah.edu/pub/tex/bib/tkdd.bib",
  acknowledgement = ack-nhfb,
  articleno =    "16",
  fjournal =     "ACM Transactions on Knowledge Discovery from Data
                 (TKDD)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1054",
}

@Article{Chen:2010:BTA,
  author =       "Ye Chen and Dmitry Pavlov and John F. Canny",
  title =        "Behavioral Targeting: The Art of Scaling Up Simple
                 Algorithms",
  journal =      j-TKDD,
  volume =       "4",
  number =       "4",
  pages =        "17:1--17:??",
  month =        oct,
  year =         "2010",
  CODEN =        "????",
  DOI =          "http://dx.doi.org/10.1145/1857947.1857949",
  ISSN =         "1556-4681 (print), 1556-472X (electronic)",
  bibdate =      "Mon Mar 28 11:43:58 MDT 2011",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tkdd/;
                 http://www.math.utah.edu/pub/tex/bib/tkdd.bib",
  acknowledgement = ack-nhfb,
  articleno =    "17",
  fjournal =     "ACM Transactions on Knowledge Discovery from Data
                 (TKDD)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1054",
}

@Article{Mohammed:2010:CDA,
  author =       "Noman Mohammed and Benjamin C. M. Fung and Patrick C.
                 K. Hung and Cheuk-Kwong Lee",
  title =        "Centralized and Distributed Anonymization for
                 High-Dimensional Healthcare Data",
  journal =      j-TKDD,
  volume =       "4",
  number =       "4",
  pages =        "18:1--18:??",
  month =        oct,
  year =         "2010",
  CODEN =        "????",
  DOI =          "http://dx.doi.org/10.1145/1857947.1857950",
  ISSN =         "1556-4681 (print), 1556-472X (electronic)",
  bibdate =      "Mon Mar 28 11:43:58 MDT 2011",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tkdd/;
                 http://www.math.utah.edu/pub/tex/bib/tkdd.bib",
  acknowledgement = ack-nhfb,
  articleno =    "18",
  fjournal =     "ACM Transactions on Knowledge Discovery from Data
                 (TKDD)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1054",
}

@Article{Liu:2010:BBM,
  author =       "Chao Liu and Fan Guo and Christos Faloutsos",
  title =        "{Bayesian} Browsing Model: Exact Inference of Document
                 Relevance from Petabyte-Scale Data",
  journal =      j-TKDD,
  volume =       "4",
  number =       "4",
  pages =        "19:1--19:??",
  month =        oct,
  year =         "2010",
  CODEN =        "????",
  DOI =          "http://dx.doi.org/10.1145/1857947.1857951",
  ISSN =         "1556-4681 (print), 1556-472X (electronic)",
  bibdate =      "Mon Mar 28 11:43:58 MDT 2011",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tkdd/;
                 http://www.math.utah.edu/pub/tex/bib/tkdd.bib",
  acknowledgement = ack-nhfb,
  articleno =    "19",
  fjournal =     "ACM Transactions on Knowledge Discovery from Data
                 (TKDD)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1054",
}

@Article{Wu:2010:MAF,
  author =       "Mingxi Wu and Chris Jermaine and Sanjay Ranka and
                 Xiuyao Song and John Gums",
  title =        "A Model-Agnostic Framework for Fast Spatial Anomaly
                 Detection",
  journal =      j-TKDD,
  volume =       "4",
  number =       "4",
  pages =        "20:1--20:??",
  month =        oct,
  year =         "2010",
  CODEN =        "????",
  DOI =          "http://dx.doi.org/10.1145/1857947.1857952",
  ISSN =         "1556-4681 (print), 1556-472X (electronic)",
  bibdate =      "Mon Mar 28 11:43:58 MDT 2011",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tkdd/;
                 http://www.math.utah.edu/pub/tex/bib/tkdd.bib",
  acknowledgement = ack-nhfb,
  articleno =    "20",
  fjournal =     "ACM Transactions on Knowledge Discovery from Data
                 (TKDD)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1054",
}

@Article{Zhong:2010:ATS,
  author =       "Ning Zhong and Gregory Piatetsky-Shapiro and Yiyu Yao
                 and Philip S. Yu",
  title =        "{ACM TKDD} Special Issue on Knowledge Discovery for
                 {Web} Intelligence",
  journal =      j-TKDD,
  volume =       "5",
  number =       "1",
  pages =        "1:1--1:??",
  month =        dec,
  year =         "2010",
  CODEN =        "????",
  DOI =          "http://dx.doi.org/10.1145/1870096.1870097",
  ISSN =         "1556-4681 (print), 1556-472X (electronic)",
  bibdate =      "Mon Mar 28 11:43:59 MDT 2011",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tkdd/;
                 http://www.math.utah.edu/pub/tex/bib/tkdd.bib",
  acknowledgement = ack-nhfb,
  articleno =    "1",
  fjournal =     "ACM Transactions on Knowledge Discovery from Data
                 (TKDD)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1054",
}

@Article{Tang:2010:CAW,
  author =       "Jie Tang and Limin Yao and Duo Zhang and Jing Zhang",
  title =        "A Combination Approach to {Web} User Profiling",
  journal =      j-TKDD,
  volume =       "5",
  number =       "1",
  pages =        "2:1--2:??",
  month =        dec,
  year =         "2010",
  CODEN =        "????",
  DOI =          "http://dx.doi.org/10.1145/1870096.1870098",
  ISSN =         "1556-4681 (print), 1556-472X (electronic)",
  bibdate =      "Mon Mar 28 11:43:59 MDT 2011",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tkdd/;
                 http://www.math.utah.edu/pub/tex/bib/tkdd.bib",
  acknowledgement = ack-nhfb,
  articleno =    "2",
  fjournal =     "ACM Transactions on Knowledge Discovery from Data
                 (TKDD)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1054",
}

@Article{Bouguessa:2010:DKS,
  author =       "Mohamed Bouguessa and Shengrui Wang and Benoit
                 Dumoulin",
  title =        "Discovering Knowledge-Sharing Communities in
                 Question-Answering Forums",
  journal =      j-TKDD,
  volume =       "5",
  number =       "1",
  pages =        "3:1--3:??",
  month =        dec,
  year =         "2010",
  CODEN =        "????",
  DOI =          "http://dx.doi.org/10.1145/1870096.1870099",
  ISSN =         "1556-4681 (print), 1556-472X (electronic)",
  bibdate =      "Mon Mar 28 11:43:59 MDT 2011",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tkdd/;
                 http://www.math.utah.edu/pub/tex/bib/tkdd.bib",
  acknowledgement = ack-nhfb,
  articleno =    "3",
  fjournal =     "ACM Transactions on Knowledge Discovery from Data
                 (TKDD)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1054",
}

@Article{Plangprasopchok:2010:MSA,
  author =       "Anon Plangprasopchok and Kristina Lerman",
  title =        "Modeling Social Annotation: {A} {Bayesian} Approach",
  journal =      j-TKDD,
  volume =       "5",
  number =       "1",
  pages =        "4:1--4:??",
  month =        dec,
  year =         "2010",
  CODEN =        "????",
  DOI =          "http://dx.doi.org/10.1145/1870096.1870100",
  ISSN =         "1556-4681 (print), 1556-472X (electronic)",
  bibdate =      "Mon Mar 28 11:43:59 MDT 2011",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tkdd/;
                 http://www.math.utah.edu/pub/tex/bib/tkdd.bib",
  acknowledgement = ack-nhfb,
  articleno =    "4",
  fjournal =     "ACM Transactions on Knowledge Discovery from Data
                 (TKDD)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1054",
}

@Article{Sakurai:2010:FDG,
  author =       "Yasushi Sakurai and Christos Faloutsos and Spiros
                 Papadimitriou",
  title =        "Fast Discovery of Group Lag Correlations in Streams",
  journal =      j-TKDD,
  volume =       "5",
  number =       "1",
  pages =        "5:1--5:??",
  month =        dec,
  year =         "2010",
  CODEN =        "????",
  DOI =          "http://dx.doi.org/10.1145/1870096.1870101",
  ISSN =         "1556-4681 (print), 1556-472X (electronic)",
  bibdate =      "Mon Mar 28 11:43:59 MDT 2011",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tkdd/;
                 http://www.math.utah.edu/pub/tex/bib/tkdd.bib",
  acknowledgement = ack-nhfb,
  articleno =    "5",
  fjournal =     "ACM Transactions on Knowledge Discovery from Data
                 (TKDD)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1054",
}

@Article{Liu:2010:FCP,
  author =       "Kun Liu and Evimaria Terzi",
  title =        "A Framework for Computing the Privacy Scores of Users
                 in Online Social Networks",
  journal =      j-TKDD,
  volume =       "5",
  number =       "1",
  pages =        "6:1--6:??",
  month =        dec,
  year =         "2010",
  CODEN =        "????",
  DOI =          "http://dx.doi.org/10.1145/1870096.1870102",
  ISSN =         "1556-4681 (print), 1556-472X (electronic)",
  bibdate =      "Mon Mar 28 11:43:59 MDT 2011",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tkdd/;
                 http://www.math.utah.edu/pub/tex/bib/tkdd.bib",
  acknowledgement = ack-nhfb,
  articleno =    "6",
  fjournal =     "ACM Transactions on Knowledge Discovery from Data
                 (TKDD)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1054",
}

@Article{Sun:2011:ISI,
  author =       "Jimeng Sun and Yan Liu and Jie Tang and Chid Apte",
  title =        "Introduction to Special Issue on Large-Scale Data
                 Mining",
  journal =      j-TKDD,
  volume =       "5",
  number =       "2",
  pages =        "7:1--7:??",
  month =        feb,
  year =         "2011",
  CODEN =        "????",
  DOI =          "http://dx.doi.org/10.1145/1921632.1921633",
  ISSN =         "1556-4681 (print), 1556-472X (electronic)",
  bibdate =      "Mon Mar 28 11:44:01 MDT 2011",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tkdd/;
                 http://www.math.utah.edu/pub/tex/bib/tkdd.bib",
  acknowledgement = ack-nhfb,
  articleno =    "7",
  fjournal =     "ACM Transactions on Knowledge Discovery from Data
                 (TKDD)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1054",
}

@Article{Kang:2011:HMR,
  author =       "U. Kang and Charalampos E. Tsourakakis and Ana Paula
                 Appel and Christos Faloutsos and Jure Leskovec",
  title =        "{HADI}: Mining Radii of Large Graphs",
  journal =      j-TKDD,
  volume =       "5",
  number =       "2",
  pages =        "8:1--8:??",
  month =        feb,
  year =         "2011",
  CODEN =        "????",
  DOI =          "http://dx.doi.org/10.1145/1921632.1921634",
  ISSN =         "1556-4681 (print), 1556-472X (electronic)",
  bibdate =      "Mon Mar 28 11:44:01 MDT 2011",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tkdd/;
                 http://www.math.utah.edu/pub/tex/bib/tkdd.bib",
  acknowledgement = ack-nhfb,
  articleno =    "8",
  fjournal =     "ACM Transactions on Knowledge Discovery from Data
                 (TKDD)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1054",
}

@Article{deVries:2011:RRL,
  author =       "Timothy de Vries and Hui Ke and Sanjay Chawla and
                 Peter Christen",
  title =        "Robust Record Linkage Blocking Using Suffix Arrays and
                 {Bloom} Filters",
  journal =      j-TKDD,
  volume =       "5",
  number =       "2",
  pages =        "9:1--9:??",
  month =        feb,
  year =         "2011",
  CODEN =        "????",
  DOI =          "http://dx.doi.org/10.1145/1921632.1921635",
  ISSN =         "1556-4681 (print), 1556-472X (electronic)",
  bibdate =      "Mon Mar 28 11:44:01 MDT 2011",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tkdd/;
                 http://www.math.utah.edu/pub/tex/bib/tkdd.bib",
  acknowledgement = ack-nhfb,
  articleno =    "9",
  fjournal =     "ACM Transactions on Knowledge Discovery from Data
                 (TKDD)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1054",
}

@Article{Dunlavy:2011:TLP,
  author =       "Daniel M. Dunlavy and Tamara G. Kolda and Evrim Acar",
  title =        "Temporal Link Prediction Using Matrix and Tensor
                 Factorizations",
  journal =      j-TKDD,
  volume =       "5",
  number =       "2",
  pages =        "10:1--10:??",
  month =        feb,
  year =         "2011",
  CODEN =        "????",
  DOI =          "http://dx.doi.org/10.1145/1921632.1921636",
  ISSN =         "1556-4681 (print), 1556-472X (electronic)",
  bibdate =      "Mon Mar 28 11:44:01 MDT 2011",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tkdd/;
                 http://www.math.utah.edu/pub/tex/bib/tkdd.bib",
  acknowledgement = ack-nhfb,
  articleno =    "10",
  fjournal =     "ACM Transactions on Knowledge Discovery from Data
                 (TKDD)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1054",
}

@Article{Magdalinos:2011:ECQ,
  author =       "Panagis Magdalinos and Christos Doulkeridis and
                 Michalis Vazirgiannis",
  title =        "Enhancing Clustering Quality through Landmark-Based
                 Dimensionality Reduction",
  journal =      j-TKDD,
  volume =       "5",
  number =       "2",
  pages =        "11:1--11:??",
  month =        feb,
  year =         "2011",
  CODEN =        "????",
  DOI =          "http://dx.doi.org/10.1145/1921632.1921637",
  ISSN =         "1556-4681 (print), 1556-472X (electronic)",
  bibdate =      "Mon Mar 28 11:44:01 MDT 2011",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tkdd/;
                 http://www.math.utah.edu/pub/tex/bib/tkdd.bib",
  acknowledgement = ack-nhfb,
  articleno =    "11",
  fjournal =     "ACM Transactions on Knowledge Discovery from Data
                 (TKDD)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1054",
}

@Article{Cheng:2011:CLA,
  author =       "Hong Cheng and Yang Zhou and Jeffrey Xu Yu",
  title =        "Clustering Large Attributed Graphs: {A} Balance
                 between Structural and Attribute Similarities",
  journal =      j-TKDD,
  volume =       "5",
  number =       "2",
  pages =        "12:1--12:??",
  month =        feb,
  year =         "2011",
  CODEN =        "????",
  DOI =          "http://dx.doi.org/10.1145/1921632.1921638",
  ISSN =         "1556-4681 (print), 1556-472X (electronic)",
  bibdate =      "Mon Mar 28 11:44:01 MDT 2011",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tkdd/;
                 http://www.math.utah.edu/pub/tex/bib/tkdd.bib",
  acknowledgement = ack-nhfb,
  articleno =    "12",
  fjournal =     "ACM Transactions on Knowledge Discovery from Data
                 (TKDD)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1054",
}

@Article{Menon:2011:FAA,
  author =       "Aditya Krishna Menon and Charles Elkan",
  title =        "Fast Algorithms for Approximating the Singular Value
                 Decomposition",
  journal =      j-TKDD,
  volume =       "5",
  number =       "2",
  pages =        "13:1--13:??",
  month =        feb,
  year =         "2011",
  CODEN =        "????",
  DOI =          "http://dx.doi.org/10.1145/1921632.1921639",
  ISSN =         "1556-4681 (print), 1556-472X (electronic)",
  bibdate =      "Mon Mar 28 11:44:01 MDT 2011",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tkdd/;
                 http://www.math.utah.edu/pub/tex/bib/tkdd.bib",
  abstract =     "A low-rank approximation to a matrix $A$ is a matrix
                  with significantly smaller rank than $A$ that is
                  close to $A$ according to some norm. Many practical
                 applications involving the use of large matrices focus
                 on low-rank approximations. By reducing the rank or
                 dimensionality of the data, we reduce the complexity of
                 analyzing the data. The singular value decomposition is
                 the most popular low-rank matrix approximation.
                 However, due to its expensive computational
                 requirements, it has often been considered intractable
                 for practical applications involving massive data.
                 Recent developments have tried to address this problem,
                 with several methods proposed to approximate the
                 decomposition with better asymptotic runtime. We
                 present an empirical study of these techniques on a
                 variety of dense and sparse datasets. We find that a
                 sampling approach of Drineas, Kannan and Mahoney is
                 often, but not always, the best performing method. This
                 method gives solutions with high accuracy much faster
                 than classical SVD algorithms, on large sparse datasets
                 in particular. Other modern methods, such as a recent
                 algorithm by Rokhlin and Tygert, also offer savings
                 compared to classical SVD algorithms. The older
                 sampling methods of Achlioptas and McSherry are shown
                 to sometimes take longer than classical SVD.",
  acknowledgement = ack-nhfb,
  articleno =    "13",
  fjournal =     "ACM Transactions on Knowledge Discovery from Data
                 (TKDD)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1054",
}
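
%%% The Menon--Elkan study above benchmarks, among others, the
%%% column-sampling approximation of Drineas, Kannan, and Mahoney.  A
%%% simplified numpy sketch of that sampling idea (not their exact
%%% algorithm or the article's experimental code):
%%%
%%%     import numpy as np
%%%
%%%     def sampled_svd(A, c, rng=None):
%%%         """Approximate the left singular vectors of A from c
%%%         columns sampled with probability proportional to their
%%%         squared norms."""
%%%         rng = rng or np.random.default_rng(0)
%%%         p = (A ** 2).sum(axis=0)
%%%         p = p / p.sum()
%%%         idx = rng.choice(A.shape[1], size=c, replace=True, p=p)
%%%         C = A[:, idx] / np.sqrt(c * p[idx])   # rescale columns
%%%         U, s, _ = np.linalg.svd(C, full_matrices=False)
%%%         return U, s
%%%
%%%     A = np.random.default_rng(1).standard_normal((500, 300))
%%%     U, s = sampled_svd(A, c=50)     # much cheaper than svd(A)
%%%     print(U.shape, s[:3])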

@Article{Wang:2011:IDC,
  author =       "Dingding Wang and Shenghuo Zhu and Tao Li and Yun Chi
                 and Yihong Gong",
  title =        "Integrating Document Clustering and Multidocument
                 Summarization",
  journal =      j-TKDD,
  volume =       "5",
  number =       "3",
  pages =        "14:1--14:??",
  month =        aug,
  year =         "2011",
  CODEN =        "????",
  DOI =          "http://dx.doi.org/10.1145/1993077.1993078",
  ISSN =         "1556-4681 (print), 1556-472X (electronic)",
  bibdate =      "Thu Aug 18 13:28:08 MDT 2011",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tkdd/;
                 http://www.math.utah.edu/pub/tex/bib/tkdd.bib",
  acknowledgement = ack-nhfb,
  articleno =    "14",
  fjournal =     "ACM Transactions on Knowledge Discovery from Data
                 (TKDD)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1054",
}

@Article{Maier:2011:INS,
  author =       "Marc Maier and Matthew Rattigan and David Jensen",
  title =        "Indexing Network Structure with Shortest-Path Trees",
  journal =      j-TKDD,
  volume =       "5",
  number =       "3",
  pages =        "15:1--15:??",
  month =        aug,
  year =         "2011",
  CODEN =        "????",
  DOI =          "http://dx.doi.org/10.1145/1993077.1993079",
  ISSN =         "1556-4681 (print), 1556-472X (electronic)",
  bibdate =      "Thu Aug 18 13:28:08 MDT 2011",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tkdd/;
                 http://www.math.utah.edu/pub/tex/bib/tkdd.bib",
  acknowledgement = ack-nhfb,
  articleno =    "15",
  fjournal =     "ACM Transactions on Knowledge Discovery from Data
                 (TKDD)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1054",
}

@Article{Wong:2011:CUA,
  author =       "Raymond Chi-Wing Wong and Ada Wai-Chee Fu and Ke Wang
                 and Philip S. Yu and Jian Pei",
  title =        "Can the Utility of Anonymized Data be Used for Privacy
                 Breaches?",
  journal =      j-TKDD,
  volume =       "5",
  number =       "3",
  pages =        "16:1--16:??",
  month =        aug,
  year =         "2011",
  CODEN =        "????",
  DOI =          "http://dx.doi.org/10.1145/1993077.1993080",
  ISSN =         "1556-4681 (print), 1556-472X (electronic)",
  bibdate =      "Thu Aug 18 13:28:08 MDT 2011",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tkdd/;
                 http://www.math.utah.edu/pub/tex/bib/tkdd.bib",
  acknowledgement = ack-nhfb,
  articleno =    "16",
  fjournal =     "ACM Transactions on Knowledge Discovery from Data
                 (TKDD)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1054",
}

@Article{Lin:2011:CDM,
  author =       "Yu-Ru Lin and Jimeng Sun and Hari Sundaram and Aisling
                 Kelliher and Paul Castro and Ravi Konuru",
  title =        "Community Discovery via Metagraph Factorization",
  journal =      j-TKDD,
  volume =       "5",
  number =       "3",
  pages =        "17:1--17:??",
  month =        aug,
  year =         "2011",
  CODEN =        "????",
  DOI =          "http://dx.doi.org/10.1145/1993077.1993081",
  ISSN =         "1556-4681 (print), 1556-472X (electronic)",
  bibdate =      "Thu Aug 18 13:28:08 MDT 2011",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tkdd/;
                 http://www.math.utah.edu/pub/tex/bib/tkdd.bib",
  acknowledgement = ack-nhfb,
  articleno =    "17",
  fjournal =     "ACM Transactions on Knowledge Discovery from Data
                 (TKDD)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1054",
}

@Article{Elkan:2012:GES,
  author =       "Charles Elkan and Yehuda Koren",
  title =        "Guest Editorial for Special Issue {KDD'10}",
  journal =      j-TKDD,
  volume =       "5",
  number =       "4",
  pages =        "18:1--18:??",
  month =        feb,
  year =         "2012",
  CODEN =        "????",
  DOI =          "http://dx.doi.org/10.1145/2086737.2086738",
  ISSN =         "1556-4681 (print), 1556-472X (electronic)",
  ISSN-L =       "1556-4681",
  bibdate =      "Fri Mar 16 15:19:57 MDT 2012",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tkdd/;
                 http://www.math.utah.edu/pub/tex/bib/tkdd.bib",
  acknowledgement = ack-nhfb,
  articleno =    "18",
  fjournal =     "ACM Transactions on Knowledge Discovery from Data
                 (TKDD)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1054",
}

@Article{Iwata:2012:SMT,
  author =       "Tomoharu Iwata and Takeshi Yamada and Yasushi Sakurai
                 and Naonori Ueda",
  title =        "Sequential Modeling of Topic Dynamics with Multiple
                 Timescales",
  journal =      j-TKDD,
  volume =       "5",
  number =       "4",
  pages =        "19:1--19:??",
  month =        feb,
  year =         "2012",
  CODEN =        "????",
  DOI =          "http://dx.doi.org/10.1145/2086737.2086739",
  ISSN =         "1556-4681 (print), 1556-472X (electronic)",
  ISSN-L =       "1556-4681",
  bibdate =      "Fri Mar 16 15:19:57 MDT 2012",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tkdd/;
                 http://www.math.utah.edu/pub/tex/bib/tkdd.bib",
  abstract =     "We propose an online topic model for sequentially
                 analyzing the time evolution of topics in document
                 collections. Topics naturally evolve with multiple
                 timescales. For example, some words may be used
                 consistently over one hundred years, while other words
                 emerge and disappear over periods of a few days. Thus,
                 in the proposed model, current topic-specific
                 distributions over words are assumed to be generated
                 based on the multiscale word distributions of the
                 previous epoch. Considering both the long- and
                 short-timescale dependency yields a more robust model.
                 We derive efficient online inference procedures based
                 on a stochastic EM algorithm, in which the model is
                 sequentially updated using newly obtained data; this
                 means that past data are not required to make the
                 inference. We demonstrate the effectiveness of the
                 proposed method in terms of predictive performance and
                 computational efficiency by examining collections of
                 real documents with timestamps.",
  acknowledgement = ack-nhfb,
  articleno =    "19",
  fjournal =     "ACM Transactions on Knowledge Discovery from Data
                 (TKDD)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1054",
}
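
%%% The Iwata et al. abstract assumes the current topic-word
%%% distribution is generated from the multiscale word distributions of
%%% the previous epoch.  A toy generative sketch of that assumption
%%% (vocabulary size, mixing weights, and concentration below are all
%%% hypothetical; the article learns them via stochastic EM):
%%%
%%%     import numpy as np
%%%
%%%     rng = np.random.default_rng(0)
%%%     V = 6                               # toy vocabulary size
%%%
%%%     # Word distributions estimated over nested past windows of
%%%     # different lengths (short-, medium-, long-timescale).
%%%     multiscale = np.vstack([rng.dirichlet(np.ones(V))
%%%                             for _ in range(3)])
%%%     weights = np.array([0.5, 0.3, 0.2])
%%%
%%%     # Current distribution drawn around the multiscale mixture.
%%%     prior_mean = weights @ multiscale
%%%     current = rng.dirichlet(50 * prior_mean)
%%%     print(current.round(3))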

@Article{Huh:2012:DTM,
  author =       "Seungil Huh and Stephen E. Fienberg",
  title =        "Discriminative Topic Modeling Based on Manifold
                 Learning",
  journal =      j-TKDD,
  volume =       "5",
  number =       "4",
  pages =        "20:1--20:??",
  month =        feb,
  year =         "2012",
  CODEN =        "????",
  DOI =          "http://dx.doi.org/10.1145/2086737.2086740",
  ISSN =         "1556-4681 (print), 1556-472X (electronic)",
  ISSN-L =       "1556-4681",
  bibdate =      "Fri Mar 16 15:19:57 MDT 2012",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tkdd/;
                 http://www.math.utah.edu/pub/tex/bib/tkdd.bib",
  abstract =     "Topic modeling has become a popular method used for
                 data analysis in various domains including text
                 documents. Previous topic model approaches, such as
                 probabilistic Latent Semantic Analysis (pLSA) and
                 Latent Dirichlet Allocation (LDA), have shown
                 impressive success in discovering low-rank hidden
                 structures for modeling text documents. These
                  approaches, however, do not take into account the
                 manifold structure of the data, which is generally
                 informative for nonlinear dimensionality reduction
                 mapping. More recent topic model approaches, Laplacian
                 PLSI (LapPLSI) and Locally-consistent Topic Model
                 (LTM), have incorporated the local manifold structure
                 into topic models and have shown resulting benefits.
                  But they fall short of the full discriminating
                  power of manifold learning, as they only enhance the
                 proximity between the low-rank representations of
                 neighboring pairs without any consideration for
                 non-neighboring pairs. In this article, we propose a
                 new approach, Discriminative Topic Model (DTM), which
                 separates non-neighboring pairs from each other in
                 addition to bringing neighboring pairs closer together,
                 thereby preserving the global manifold structure as
                 well as improving local consistency. We also present a
                 novel model-fitting algorithm based on the generalized
                 EM algorithm and the concept of Pareto improvement. We
                 empirically demonstrate the success of DTM in terms of
                 unsupervised clustering and semisupervised
                 classification accuracies on text corpora and
                 robustness to parameters compared to state-of-the-art
                 techniques.",
  acknowledgement = ack-nhfb,
  articleno =    "20",
  fjournal =     "ACM Transactions on Knowledge Discovery from Data
                 (TKDD)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1054",
}
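
%%% The DTM abstract above pulls neighboring pairs together while
%%% pushing non-neighboring pairs apart in the low-rank representation.
%%% A toy objective in that spirit (illustration only; DTM's actual
%%% probabilistic objective and its generalized-EM fitting are in the
%%% article):
%%%
%%%     import numpy as np
%%%
%%%     def manifold_regularizer(Z, neighbors, non_neighbors, margin=1.0):
%%%         # Pull: squared distance between neighboring pairs.
%%%         pull = sum(np.sum((Z[i] - Z[j]) ** 2) for i, j in neighbors)
%%%         # Push: hinge penalty when non-neighbors are within margin.
%%%         push = sum(max(0.0, margin - np.linalg.norm(Z[i] - Z[j])) ** 2
%%%                    for i, j in non_neighbors)
%%%         return pull + push
%%%
%%%     Z = np.random.default_rng(0).standard_normal((4, 2))  # 4 docs
%%%     print(manifold_regularizer(Z, [(0, 1)], [(0, 3)]))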

@Article{Gomez-Rodriguez:2012:IND,
  author =       "Manuel Gomez-Rodriguez and Jure Leskovec and Andreas
                 Krause",
  title =        "Inferring Networks of Diffusion and Influence",
  journal =      j-TKDD,
  volume =       "5",
  number =       "4",
  pages =        "21:1--21:??",
  month =        feb,
  year =         "2012",
  CODEN =        "????",
  DOI =          "http://dx.doi.org/10.1145/2086737.2086741",
  ISSN =         "1556-4681 (print), 1556-472X (electronic)",
  ISSN-L =       "1556-4681",
  bibdate =      "Fri Mar 16 15:19:57 MDT 2012",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tkdd/;
                 http://www.math.utah.edu/pub/tex/bib/tkdd.bib",
  abstract =     "Information diffusion and virus propagation are
                 fundamental processes taking place in networks. While
                 it is often possible to directly observe when nodes
                 become infected with a virus or publish the
                 information, observing individual transmissions (who
                 infects whom, or who influences whom) is typically very
                 difficult. Furthermore, in many applications, the
                 underlying network over which the diffusions and
                 propagations spread is actually unobserved. We tackle
                 these challenges by developing a method for tracing
                 paths of diffusion and influence through networks and
                 inferring the networks over which contagions propagate.
                 Given the times when nodes adopt pieces of information
                 or become infected, we identify the optimal network
                 that best explains the observed infection times. Since
                 the optimization problem is NP-hard to solve exactly,
                 we develop an efficient approximation algorithm that
                 scales to large datasets and finds provably
                 near-optimal networks. We demonstrate the effectiveness
                 of our approach by tracing information diffusion in a
                  set of 170 million blogs and news articles over a
                  one-year period to infer how information flows through the
                 online media space. We find that the diffusion network
                 of news for the top 1,000 media sites and blogs tends
                 to have a core-periphery structure with a small set of
                 core media sites that diffuse information to the rest
                 of the Web. These sites tend to have stable circles of
                 influence with more general news media sites acting as
                 connectors between them.",
  acknowledgement = ack-nhfb,
  articleno =    "21",
  fjournal =     "ACM Transactions on Knowledge Discovery from Data
                 (TKDD)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1054",
}
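
%%% The near-optimal guarantee in the NetInf abstract above comes from
%%% greedy maximization of a submodular likelihood over edge sets.  A
%%% generic sketch of that technique class, with a toy coverage
%%% function standing in for the article's diffusion likelihood:
%%%
%%%     def greedy_max(candidates, objective, k):
%%%         """Repeatedly add the element with the largest marginal
%%%         gain (the classic (1 - 1/e) greedy scheme)."""
%%%         chosen = set()
%%%         for _ in range(k):
%%%             best = max((c for c in candidates if c not in chosen),
%%%                        key=lambda c: objective(chosen | {c})
%%%                                      - objective(chosen))
%%%             chosen.add(best)
%%%         return chosen
%%%
%%%     cover = {"e1": {1, 2, 3}, "e2": {3, 4}, "e3": {5}, "e4": {1, 2}}
%%%
%%%     def objective(S):
%%%         return len(set().union(*(cover[c] for c in S))) if S else 0
%%%
%%%     print(greedy_max(cover, objective, k=2))  # picks e1, then e2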

@Article{Chen:2012:LIS,
  author =       "Jianhui Chen and Ji Liu and Jieping Ye",
  title =        "Learning Incoherent Sparse and Low-Rank Patterns from
                 Multiple Tasks",
  journal =      j-TKDD,
  volume =       "5",
  number =       "4",
  pages =        "22:1--22:??",
  month =        feb,
  year =         "2012",
  CODEN =        "????",
  DOI =          "http://dx.doi.org/10.1145/2086737.2086742",
  ISSN =         "1556-4681 (print), 1556-472X (electronic)",
  ISSN-L =       "1556-4681",
  bibdate =      "Fri Mar 16 15:19:57 MDT 2012",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tkdd/;
                 http://www.math.utah.edu/pub/tex/bib/tkdd.bib",
  abstract =     "We consider the problem of learning incoherent sparse
                 and low-rank patterns from multiple tasks. Our approach
                 is based on a linear multitask learning formulation, in
                 which the sparse and low-rank patterns are induced by a
                 cardinality regularization term and a low-rank
                 constraint, respectively. This formulation is
                 nonconvex; we convert it into its convex surrogate,
                 which can be routinely solved via semidefinite
                 programming for small-size problems. We propose
                 employing the general projected gradient scheme to
                 efficiently solve such a convex surrogate; however, in
                 the optimization formulation, the objective function is
                 nondifferentiable and the feasible domain is
                 nontrivial. We present the procedures for computing the
                 projected gradient and ensuring the global convergence
                 of the projected gradient scheme. The computation of
                 the projected gradient involves a constrained
                 optimization problem; we show that the optimal solution
                 to such a problem can be obtained via solving an
                 unconstrained optimization subproblem and a Euclidean
                 projection subproblem. We also present two projected
                 gradient algorithms and analyze their rates of
                 convergence in detail. In addition, we illustrate the
                 use of the presented projected gradient algorithms for
                 the proposed multitask learning formulation using the
                 least squares loss. Experimental results on a
                 collection of real-world data sets demonstrate the
                 effectiveness of the proposed multitask learning
                 formulation and the efficiency of the proposed
                 projected gradient algorithms.",
  acknowledgement = ack-nhfb,
  articleno =    "22",
  fjournal =     "ACM Transactions on Knowledge Discovery from Data
                 (TKDD)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1054",
}
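
%%% The multitask abstract above alternates unconstrained subproblems
%%% with Euclidean projections inside a projected gradient scheme.  A
%%% sketch of one standard low-rank building block in such schemes,
%%% soft-thresholding of singular values (the proximal operator of the
%%% nuclear norm; shown for illustration only, as the article derives
%%% its own projection steps):
%%%
%%%     import numpy as np
%%%
%%%     def svd_soft_threshold(M, tau):
%%%         """Shrink the singular values of M by tau."""
%%%         U, s, Vt = np.linalg.svd(M, full_matrices=False)
%%%         s = np.maximum(s - tau, 0.0)
%%%         return (U * s) @ Vt
%%%
%%%     M = np.random.default_rng(0).standard_normal((8, 6))
%%%     L = svd_soft_threshold(M, tau=1.0)
%%%     print(np.linalg.matrix_rank(L))   # typically below rank(M)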

@Article{Yu:2012:LLC,
  author =       "Hsiang-Fu Yu and Cho-Jui Hsieh and Kai-Wei Chang and
                 Chih-Jen Lin",
  title =        "Large Linear Classification When Data Cannot Fit in
                 Memory",
  journal =      j-TKDD,
  volume =       "5",
  number =       "4",
  pages =        "23:1--23:??",
  month =        feb,
  year =         "2012",
  CODEN =        "????",
  DOI =          "http://dx.doi.org/10.1145/2086737.2086743",
  ISSN =         "1556-4681 (print), 1556-472X (electronic)",
  ISSN-L =       "1556-4681",
  bibdate =      "Fri Mar 16 15:19:57 MDT 2012",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tkdd/;
                 http://www.math.utah.edu/pub/tex/bib/tkdd.bib",
  abstract =     "Recent advances in linear classification have shown
                 that for applications such as document classification,
                 the training process can be extremely efficient.
                 However, most of the existing training methods are
                 designed by assuming that data can be stored in the
                 computer memory. These methods cannot be easily applied
                  to data larger than the memory capacity due to
                  random access to the disk. We propose and analyze a
                 block minimization framework for data larger than the
                 memory size. At each step a block of data is loaded
                 from the disk and handled by certain learning methods.
                 We investigate two implementations of the proposed
                 framework for primal and dual SVMs, respectively.
                 Because data cannot fit in memory, many design
                 considerations are very different from those for
                 traditional algorithms. We discuss and compare with
                 existing approaches that are able to handle data larger
                 than memory. Experiments using data sets 20 times
                 larger than the memory demonstrate the effectiveness of
                 the proposed method.",
  acknowledgement = ack-nhfb,
  articleno =    "23",
  fjournal =     "ACM Transactions on Knowledge Discovery from Data
                 (TKDD)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1054",
}
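
%%% The Yu et al. abstract above loads one block of data from disk at
%%% a time and updates the model per block.  A hedged sketch of that
%%% loop using scikit-learn's out-of-core interface as a stand-in (the
%%% article's block minimization for primal and dual SVMs is more
%%% elaborate; load_block below is a hypothetical loader):
%%%
%%%     import numpy as np
%%%     from sklearn.linear_model import SGDClassifier
%%%
%%%     def load_block(i):
%%%         # Hypothetical: read block i from disk; synthetic here.
%%%         rng = np.random.default_rng(i)
%%%         X = rng.standard_normal((1000, 50))
%%%         y = (X[:, 0] > 0).astype(int)
%%%         return X, y
%%%
%%%     clf = SGDClassifier(loss="hinge")   # linear SVM via SGD
%%%     for i in range(20):                 # one pass over 20 blocks
%%%         X, y = load_block(i)
%%%         clf.partial_fit(X, y, classes=np.array([0, 1]))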

@Article{Shahaf:2012:CTL,
  author =       "Dafna Shahaf and Carlos Guestrin",
  title =        "Connecting Two (or Less) Dots: Discovering Structure
                 in News Articles",
  journal =      j-TKDD,
  volume =       "5",
  number =       "4",
  pages =        "24:1--24:??",
  month =        feb,
  year =         "2012",
  CODEN =        "????",
  DOI =          "http://dx.doi.org/10.1145/2086737.2086744",
  ISSN =         "1556-4681 (print), 1556-472X (electronic)",
  ISSN-L =       "1556-4681",
  bibdate =      "Fri Mar 16 15:19:57 MDT 2012",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tkdd/;
                 http://www.math.utah.edu/pub/tex/bib/tkdd.bib",
  abstract =     "Finding information is becoming a major part of our
                 daily life. Entire sectors, from Web users to
                 scientists and intelligence analysts, are increasingly
                 struggling to keep up with the larger and larger
                 amounts of content published every day. With this much
                 data, it is often easy to miss the big picture. In this
                 article, we investigate methods for automatically
                 connecting the dots---providing a structured, easy way
                 to navigate within a new topic and discover hidden
                 connections. We focus on the news domain: given two
                 news articles, our system automatically finds a
                 coherent chain linking them together. For example, it
                 can recover the chain of events starting with the
                 decline of home prices (January 2007), and ending with
                 the health care debate (2009). We formalize the
                 characteristics of a good chain and provide a fast
                 search-driven algorithm to connect two fixed endpoints.
                 We incorporate user feedback into our framework,
                 allowing the stories to be refined and personalized. We
                 also provide a method to handle partially-specified
                 endpoints, for users who do not know both ends of a
                 story. Finally, we evaluate our algorithm over real
                 news data. Our user studies demonstrate that the
                 objective we propose captures the users' intuitive
                 notion of coherence, and that our algorithm effectively
                 helps users understand the news.",
  acknowledgement = ack-nhfb,
  articleno =    "24",
  fjournal =     "ACM Transactions on Knowledge Discovery from Data
                 (TKDD)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1054",
}

@Article{Ienco:2012:CDL,
  author =       "Dino Ienco and Ruggero G. Pensa and Rosa Meo",
  title =        "From Context to Distance: Learning Dissimilarity for
                 Categorical Data Clustering",
  journal =      j-TKDD,
  volume =       "6",
  number =       "1",
  pages =        "1:1--1:??",
  month =        mar,
  year =         "2012",
  CODEN =        "????",
  DOI =          "http://dx.doi.org/10.1145/2133360.2133361",
  ISSN =         "1556-4681 (print), 1556-472X (electronic)",
  ISSN-L =       "1556-4681",
  bibdate =      "Tue Nov 6 18:30:38 MST 2012",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tkdd/;
                 http://www.math.utah.edu/pub/tex/bib/tkdd.bib",
  abstract =     "Clustering data described by categorical attributes is
                 a challenging task in data mining applications. Unlike
                 numerical attributes, it is difficult to define a
                 distance between pairs of values of a categorical
                 attribute, since the values are not ordered. In this
                 article, we propose a framework to learn a
                 context-based distance for categorical attributes. The
                 key intuition of this work is that the distance between
                  two values of a categorical attribute $A_i$ can be
                  determined by the way in which the values of the other
                  attributes $A_j$ are distributed in the dataset
                  objects: if they are similarly distributed across the
                  groups of objects corresponding to the distinct
                  values of $A_i$, a low distance is obtained. We
                  also propose a solution to the critical issue of
                  choosing the attributes $A_j$. We validate our
                 approach by embedding our distance learning framework
                  in a hierarchical clustering algorithm. We applied it
                  to various real-world and synthetic datasets, both
                  low- and high-dimensional. Experimental results show
                  that our method is competitive with state-of-the-art
                  categorical data clustering approaches. We
                 also show that our approach is scalable and has a low
                 impact on the overall computational time of a
                 clustering task.",
  acknowledgement = ack-nhfb,
  articleno =    "1",
  fjournal =     "ACM Transactions on Knowledge Discovery from Data
                 (TKDD)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1054",
}
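
%%% The Ienco--Pensa--Meo abstract above measures the distance between
%%% two values of $A_i$ by how the other attributes distribute across
%%% them.  A toy sketch using total variation between conditional
%%% distributions (a simplified reading; the article also learns which
%%% context attributes $A_j$ to use):
%%%
%%%     from collections import Counter
%%%
%%%     def conditional_dist(rows, i, value, j):
%%%         """Estimate P(A_j | A_i = value) from the rows."""
%%%         counts = Counter(r[j] for r in rows if r[i] == value)
%%%         total = sum(counts.values())
%%%         return {k: v / total for k, v in counts.items()}
%%%
%%%     def context_distance(rows, i, v1, v2, context_attrs):
%%%         d = 0.0
%%%         for j in context_attrs:
%%%             p = conditional_dist(rows, i, v1, j)
%%%             q = conditional_dist(rows, i, v2, j)
%%%             keys = set(p) | set(q)
%%%             d += 0.5 * sum(abs(p.get(k, 0.0) - q.get(k, 0.0))
%%%                            for k in keys)
%%%         return d / len(context_attrs)
%%%
%%%     toy = [("red", "s"), ("red", "s"), ("blue", "t"), ("green", "s")]
%%%     print(context_distance(toy, 0, "red", "blue", [1]))    # 1.0
%%%     print(context_distance(toy, 0, "red", "green", [1]))   # 0.0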

@Article{Li:2012:EMG,
  author =       "Chun Li and Qingyan Yang and Jianyong Wang and Ming
                 Li",
  title =        "Efficient Mining of Gap-Constrained Subsequences and
                 Its Various Applications",
  journal =      j-TKDD,
  volume =       "6",
  number =       "1",
  pages =        "2:1--2:??",
  month =        mar,
  year =         "2012",
  CODEN =        "????",
  DOI =          "http://dx.doi.org/10.1145/2133360.2133362",
  ISSN =         "1556-4681 (print), 1556-472X (electronic)",
  ISSN-L =       "1556-4681",
  bibdate =      "Tue Nov 6 18:30:38 MST 2012",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tkdd/;
                 http://www.math.utah.edu/pub/tex/bib/tkdd.bib",
  abstract =     "Mining frequent subsequence patterns is a typical
                 data-mining problem and various efficient sequential
                 pattern mining algorithms have been proposed. In many
                  application domains (e.g., biology), the frequent
                  subsequences that satisfy predefined gap
                  requirements are more meaningful than general
                 sequential patterns. In this article, we propose two
                 algorithms, Gap-BIDE for mining closed gap-constrained
                 subsequences from a set of input sequences, and
                 Gap-Connect for mining repetitive gap-constrained
                 subsequences from a single input sequence. Inspired by
                 some state-of-the-art closed or constrained sequential
                 pattern mining algorithms, the Gap-BIDE algorithm
                 adopts an efficient approach to finding the complete
                 set of closed sequential patterns with gap constraints,
                 while the Gap-Connect algorithm efficiently mines an
                 approximate set of long patterns by connecting short
                 patterns. We also present several methods for feature
                 selection from the set of gap-constrained patterns for
                 the purpose of classification and clustering. Our
                 extensive performance study shows that our approaches
                 are very efficient in mining frequent subsequences with
                 gap constraints, and the gap-constrained pattern based
                 classification/clustering approaches can achieve
                 high-quality results.",
  acknowledgement = ack-nhfb,
  articleno =    "2",
  fjournal =     "ACM Transactions on Knowledge Discovery from Data
                 (TKDD)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1054",
}
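
%%% The Gap-BIDE/Gap-Connect abstract above mines subsequences under
%%% gap constraints.  A toy occurrence check for one pattern under a
%%% [min_gap, max_gap] constraint between consecutive elements (the
%%% article mines all closed such patterns; this only tests a single
%%% pattern against a single sequence):
%%%
%%%     def occurs_with_gaps(sequence, pattern, min_gap, max_gap):
%%%         def match(p_idx, s_idx):
%%%             if p_idx == len(pattern):
%%%                 return True
%%%             lo = s_idx + min_gap + 1 if p_idx else 0
%%%             hi = s_idx + max_gap + 1 if p_idx else len(sequence) - 1
%%%             for k in range(lo, min(hi, len(sequence) - 1) + 1):
%%%                 if sequence[k] == pattern[p_idx] and match(p_idx + 1, k):
%%%                     return True
%%%             return False
%%%         return match(0, -1)
%%%
%%%     print(occurs_with_gaps("axxbyc", "abc", 1, 2))  # True
%%%     print(occurs_with_gaps("abc",    "abc", 1, 2))  # False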

@Article{Liu:2012:IBA,
  author =       "Fei Tony Liu and Kai Ming Ting and Zhi-Hua Zhou",
  title =        "Isolation-Based Anomaly Detection",
  journal =      j-TKDD,
  volume =       "6",
  number =       "1",
  pages =        "3:1--3:??",
  month =        mar,
  year =         "2012",
  CODEN =        "????",
  DOI =          "http://dx.doi.org/10.1145/2133360.2133363",
  ISSN =         "1556-4681 (print), 1556-472X (electronic)",
  ISSN-L =       "1556-4681",
  bibdate =      "Tue Nov 6 18:30:38 MST 2012",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tkdd/;
                 http://www.math.utah.edu/pub/tex/bib/tkdd.bib",
  abstract =     "Anomalies are data points that are few and different.
                  As a result of these properties, we show that
                  anomalies are susceptible to a mechanism called
                  isolation. This article proposes a method called
                  Isolation Forest ($i$Forest), which detects anomalies
                  purely based on the concept of isolation, without
                  employing any distance or density
                  measure---fundamentally different from all existing
                  methods. As a result, $i$Forest is able to exploit
                  subsampling (i) to achieve a low linear time
                  complexity and a small memory requirement and (ii) to
                  deal with the effects of swamping and masking
                  effectively. Our empirical evaluation shows that
                  $i$Forest outperforms ORCA, one-class SVM, LOF, and
                  Random Forests in terms of AUC and processing time,
                  and that it is robust against masking and swamping
                  effects. $i$Forest also works well in high-dimensional
                  problems containing a large number of irrelevant
                  attributes, and when anomalies are not available in
                  the training sample.",
  acknowledgement = ack-nhfb,
  articleno =    "3",
  fjournal =     "ACM Transactions on Knowledge Discovery from Data
                 (TKDD)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1054",
}
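
%%% The isolation mechanism described above admits a very small
%%% implementation.  The following Python sketch (a simplified
%%% reconstruction from the abstract, not the authors' code) grows
%%% random trees on subsamples and scores a point by its average path
%%% length; scores near 1 indicate anomalies:
%%%
%%%   import math, random
%%%
%%%   def c(n):
%%%       # average unsuccessful-search path length in a BST of n points
%%%       if n <= 1:
%%%           return 0.0
%%%       return 2.0 * (math.log(n - 1) + 0.5772156649) - 2.0 * (n - 1) / n
%%%
%%%   def itree(X, depth, limit):
%%%       if depth >= limit or len(X) <= 1:
%%%           return ('leaf', len(X))
%%%       q = random.randrange(len(X[0]))        # random split attribute
%%%       lo, hi = min(x[q] for x in X), max(x[q] for x in X)
%%%       if lo == hi:
%%%           return ('leaf', len(X))
%%%       p = random.uniform(lo, hi)             # random split value
%%%       return ('node', q, p,
%%%               itree([x for x in X if x[q] < p], depth + 1, limit),
%%%               itree([x for x in X if x[q] >= p], depth + 1, limit))
%%%
%%%   def path(x, t, d=0):
%%%       if t[0] == 'leaf':
%%%           return d + c(t[1])
%%%       _, q, p, L, R = t
%%%       return path(x, L if x[q] < p else R, d + 1)
%%%
%%%   def score(x, X, trees=100, psi=256):
%%%       psi = min(psi, len(X))
%%%       forest = [itree(random.sample(X, psi), 0,
%%%                       math.ceil(math.log2(psi)))
%%%                 for _ in range(trees)]
%%%       h = sum(path(x, t) for t in forest) / trees
%%%       return 2.0 ** (-h / c(psi))            # close to 1 => anomaly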

@Article{Jin:2012:MML,
  author =       "Yu Jin and Nick Duffield and Jeffrey Erman and Patrick
                 Haffner and Subhabrata Sen and Zhi-Li Zhang",
  title =        "A Modular Machine Learning System for Flow-Level
                 Traffic Classification in Large Networks",
  journal =      j-TKDD,
  volume =       "6",
  number =       "1",
  pages =        "4:1--4:??",
  month =        mar,
  year =         "2012",
  CODEN =        "????",
  DOI =          "http://dx.doi.org/10.1145/2133360.2133364",
  ISSN =         "1556-4681 (print), 1556-472X (electronic)",
  ISSN-L =       "1556-4681",
  bibdate =      "Tue Nov 6 18:30:38 MST 2012",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tkdd/;
                 http://www.math.utah.edu/pub/tex/bib/tkdd.bib",
  abstract =     "The ability to accurately and scalably classify
                 network traffic is of critical importance to a wide
                 range of management tasks of large networks, such as
                 tier-1 ISP networks and global enterprise networks.
                 Guided by the practical constraints and requirements of
                 traffic classification in large networks, in this
                 article, we explore the design of an accurate and
                 scalable machine learning based flow-level traffic
                 classification system, which is trained on a dataset of
                 flow-level data that has been annotated with
                 application protocol labels by a packet-level
                 classifier. Our system employs a lightweight modular
                 architecture, which combines a series of simple linear
                 binary classifiers, each of which can be efficiently
                 implemented and trained on vast amounts of flow data in
                 parallel, and embraces three key innovative mechanisms,
                 weighted threshold sampling, logistic calibration, and
                 intelligent data partitioning, to achieve scalability
                 while attaining high accuracy. Evaluations using real
                 traffic data from multiple locations in a large ISP
                 show that our system accurately reproduces the labels
                  of the packet-level classifier when run on (unlabeled)
                 flow records, while meeting the scalability and
                 stability requirements of large ISP networks. Using
                 training and test datasets that are two months apart
                 and collected from two different locations, the flow
                 error rates are only 3\% for TCP flows and 0.4\% for
                 UDP flows. We further show that such error rates can be
                 reduced by combining the information of spatial
                 distributions of flows, or collective traffic
                 statistics, during classification. We propose a novel
                 two-step model, which seamlessly integrates these
                 collective traffic statistics into the existing traffic
                  classification system. Experimental results show a
                  performance improvement on all traffic classes and an
                  overall error-rate reduction of 15\%. In addition to
                  its high accuracy, our implementation easily scales
                  at runtime to classify traffic on 10-Gbps links.",
  acknowledgement = ack-nhfb,
  articleno =    "4",
  fjournal =     "ACM Transactions on Knowledge Discovery from Data
                 (TKDD)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1054",
}
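
%%% The modular design above (independent linear binary classifiers plus
%%% logistic calibration) can be approximated in a few lines of Python
%%% with scikit-learn.  The data here are synthetic stand-ins for flow
%%% features and protocol labels; this mirrors the spirit of the
%%% architecture, not the authors' production system:
%%%
%%%   import numpy as np
%%%   from sklearn.svm import LinearSVC
%%%   from sklearn.calibration import CalibratedClassifierCV
%%%   from sklearn.multiclass import OneVsRestClassifier
%%%
%%%   rng = np.random.default_rng(0)
%%%   X = rng.normal(size=(300, 5))        # hypothetical flow features
%%%   y = rng.integers(0, 3, size=300)     # hypothetical protocol labels
%%%
%%%   # One linear binary classifier per class, each Platt-calibrated so
%%%   # the per-class scores become comparable probabilities; the binary
%%%   # problems are independent and can be trained in parallel.
%%%   clf = OneVsRestClassifier(
%%%       CalibratedClassifierCV(LinearSVC(), method="sigmoid", cv=3),
%%%       n_jobs=-1)
%%%   clf.fit(X, y)
%%%   proba = clf.predict_proba(X)         # calibrated class probabilities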

@Article{Mavroeidis:2012:SSF,
  author =       "Dimitrios Mavroeidis and Panagis Magdalinos",
  title =        "A Sequential Sampling Framework for Spectral $k$-Means
                 Based on Efficient Bootstrap Accuracy Estimations:
                 Application to Distributed Clustering",
  journal =      j-TKDD,
  volume =       "6",
  number =       "2",
  pages =        "5:1--5:??",
  month =        jul,
  year =         "2012",
  CODEN =        "????",
  DOI =          "http://dx.doi.org/10.1145/2297456.2297457",
  ISSN =         "1556-4681 (print), 1556-472X (electronic)",
  ISSN-L =       "1556-4681",
  bibdate =      "Tue Nov 6 18:30:38 MST 2012",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tkdd/;
                 http://www.math.utah.edu/pub/tex/bib/tkdd.bib",
  abstract =     "The scalability of learning algorithms has always been
                 a central concern for data mining researchers, and
                 nowadays, with the rapid increase in data storage
                 capacities and availability, its importance has
                 increased. To this end, sampling has been studied by
                 several researchers in an effort to derive sufficiently
                 accurate models using only small data fractions. In
                 this article we focus on spectral $k$-means, that is,
                 the $k$-means approximation as derived by the spectral
                 relaxation, and propose a sequential sampling framework
                 that iteratively enlarges the sample size until the
                 $k$-means results (objective function and cluster
                 structure) become indistinguishable from the asymptotic
                 (infinite-data) output. In the proposed framework we
                 adopt a commonly applied principle in data mining
                 research that considers the use of minimal assumptions
                 concerning the data generating distribution. This
                 restriction imposes several challenges, mainly related
                 to the efficiency of the sequential sampling procedure.
                 These challenges are addressed using elements of matrix
                 perturbation theory and statistics. Moreover, although
                 the main focus is on spectral $k$-means, we also
                 demonstrate that the proposed framework can be
                  generalized to handle spectral clustering. The proposed
                  sequential sampling framework is subsequently employed
                  to address the distributed clustering problem,
                 where the task is to construct a global model for data
                 that resides in distributed network nodes. The main
                 challenge in this context is related to the bandwidth
                 constraints that are commonly imposed, thus requiring
                 that the distributed clustering algorithm consumes a
                 minimal amount of network load. This illustrates the
                 applicability of the proposed approach, as it enables
                 the determination of a minimal sample size that can be
                 used for constructing an accurate clustering model that
                 entails the distributional characteristics of the data.
                 As opposed to the relevant distributed $k$-means
                 approaches, our framework takes into account the fact
                 that the choice of the number of clusters has a crucial
                 effect on the required amount of communication. More
                 precisely, the proposed algorithm is able to derive a
                 statistical estimation of the required relative sizes
                 for all possible values of $k$. This unique feature of
                 our distributed clustering framework enables a network
                 administrator to choose an economic solution that
                 identifies the crude cluster structure of a dataset and
                 not devote excessive network resources for identifying
                 all the ``correct'' detailed clusters.",
  acknowledgement = ack-nhfb,
  articleno =    "5",
  fjournal =     "ACM Transactions on Knowledge Discovery from Data
                 (TKDD)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1054",
}
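
%%% The stopping idea (enlarge the sample until the $k$-means output
%%% stabilizes) can be sketched as follows, with plain $k$-means and a
%%% naive relative-change test standing in for the paper's bootstrap
%%% accuracy estimations; the tolerances here are invented:
%%%
%%%   import numpy as np
%%%
%%%   def kmeans_obj(X, k, iters=25, seed=0):
%%%       rng = np.random.default_rng(seed)
%%%       C = X[rng.choice(len(X), k, replace=False)]
%%%       for _ in range(iters):
%%%           d = ((X[:, None, :] - C[None]) ** 2).sum(-1)
%%%           lab = d.argmin(1)
%%%           C = np.array([X[lab == j].mean(0) if (lab == j).any()
%%%                         else C[j] for j in range(k)])
%%%       return d.min(1).mean()          # per-point objective value
%%%
%%%   def sufficient_sample(X, k, n0=100, tol=0.01, seed=0):
%%%       rng = np.random.default_rng(seed)
%%%       n, prev = min(n0, len(X)), None
%%%       while n < len(X):
%%%           obj = kmeans_obj(X[rng.choice(len(X), n, replace=False)], k)
%%%           if prev and abs(obj - prev) / prev < tol:
%%%               return n                # objective has stabilized
%%%           prev, n = obj, 2 * n
%%%       return len(X)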

@Article{Das:2012:MIG,
  author =       "Sanmay Das and Malik Magdon-Ismail",
  title =        "A Model for Information Growth in Collective Wisdom
                 Processes",
  journal =      j-TKDD,
  volume =       "6",
  number =       "2",
  pages =        "6:1--6:??",
  month =        jul,
  year =         "2012",
  CODEN =        "????",
  DOI =          "http://dx.doi.org/10.1145/2297456.2297458",
  ISSN =         "1556-4681 (print), 1556-472X (electronic)",
  ISSN-L =       "1556-4681",
  bibdate =      "Tue Nov 6 18:30:38 MST 2012",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tkdd/;
                 http://www.math.utah.edu/pub/tex/bib/tkdd.bib",
  abstract =     "Collaborative media such as wikis have become
                 enormously successful venues for information creation.
                 Articles accrue information through the asynchronous
                 editing of users who arrive both seeking information
                 and possibly able to contribute information. Most
                 articles stabilize to high-quality, trusted sources of
                 information representing the collective wisdom of all
                 the users who edited the article. We propose a model
                 for information growth which relies on two main
                 observations: (i) as an article's quality improves, it
                 attracts visitors at a faster rate (a rich-get-richer
                 phenomenon); and, simultaneously, (ii) the chances that
                 a new visitor will improve the article drops (there is
                 only so much that can be said about a particular
                 topic). Our model is able to reproduce many features of
                 the edit dynamics observed on Wikipedia; in particular,
                 it captures the observed rise in the edit rate,
                  followed by a $1/t$ decay. Despite differences in the
                 media, we also document similar features in the comment
                 rates for a segment of the LiveJournal blogosphere.",
  acknowledgement = ack-nhfb,
  articleno =    "6",
  fjournal =     "ACM Transactions on Knowledge Discovery from Data
                 (TKDD)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1054",
}
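
%%% The two mechanisms in the model (visits arrive faster as quality
%%% grows; each visit improves the article with shrinking probability)
%%% are easy to simulate.  The rate constants below are invented for
%%% illustration; this is not the authors' calibrated model:
%%%
%%%   import random
%%%
%%%   def simulate(steps=200000, visit_rate=1e-4, sat=5.0):
%%%       q, edit_times = 1.0, []
%%%       for t in range(1, steps):
%%%           if random.random() < min(1.0, visit_rate * q):  # rich get richer
%%%               if random.random() < 1.0 / (1.0 + q / sat): # less left to add
%%%                   q += 1.0
%%%                   edit_times.append(t)
%%%       return edit_times    # edit rate first rises, then decays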

@Article{Xu:2012:GME,
  author =       "Tianbing Xu and Zhongfei Zhang and Philip S. Yu and Bo
                 Long",
  title =        "Generative Models for Evolutionary Clustering",
  journal =      j-TKDD,
  volume =       "6",
  number =       "2",
  pages =        "7:1--7:??",
  month =        jul,
  year =         "2012",
  CODEN =        "????",
  DOI =          "http://dx.doi.org/10.1145/2297456.2297459",
  ISSN =         "1556-4681 (print), 1556-472X (electronic)",
  ISSN-L =       "1556-4681",
  bibdate =      "Tue Nov 6 18:30:38 MST 2012",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tkdd/;
                 http://www.math.utah.edu/pub/tex/bib/tkdd.bib",
  abstract =     "This article studies evolutionary clustering, a
                  recently emerged topic with many important
                  applications, notably in dynamic social network
                  analysis. Based on the recent
                  literature on nonparametric Bayesian models, we have
                  developed two generative models: DPChain and HDP-HTM.
                  DPChain is derived from the Dirichlet process mixture
                  (DPM) model, with an exponentially decaying component
                  over time. HDP-HTM combines the hierarchical
                  Dirichlet process (HDP) with a hierarchical transition
                  matrix (HTM) based on the proposed infinite
                  hierarchical Markov state model (iHMS). Both models
                 substantially advance the literature on evolutionary
                 clustering, in the sense that not only do they both
                 perform better than those in the existing literature,
                 but more importantly, they are capable of automatically
                 learning the cluster numbers and explicitly addressing
                 the corresponding issues. Extensive evaluations have
                 demonstrated the effectiveness and the promise of these
                 two solutions compared to the state-of-the-art
                 literature.",
  acknowledgement = ack-nhfb,
  articleno =    "7",
  fjournal =     "ACM Transactions on Knowledge Discovery from Data
                 (TKDD)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1054",
}

@Article{Wang:2012:LME,
  author =       "Shaojun Wang and Dale Schuurmans and Yunxin Zhao",
  title =        "The Latent Maximum Entropy Principle",
  journal =      j-TKDD,
  volume =       "6",
  number =       "2",
  pages =        "8:1--8:??",
  month =        jul,
  year =         "2012",
  CODEN =        "????",
  DOI =          "http://dx.doi.org/10.1145/2297456.2297460",
  ISSN =         "1556-4681 (print), 1556-472X (electronic)",
  ISSN-L =       "1556-4681",
  bibdate =      "Tue Nov 6 18:30:38 MST 2012",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tkdd/;
                 http://www.math.utah.edu/pub/tex/bib/tkdd.bib",
  abstract =     "We present an extension to Jaynes' maximum entropy
                 principle that incorporates latent variables. The
                 principle of latent maximum entropy we propose is
                 different from both Jaynes' maximum entropy principle
                 and maximum likelihood estimation, but can yield better
                 estimates in the presence of hidden variables and
                 limited training data. We first show that solving for a
                 latent maximum entropy model poses a hard nonlinear
                 constrained optimization problem in general. However,
                 we then show that feasible solutions to this problem
                 can be obtained efficiently for the special case of
                 log-linear models---which forms the basis for an
                 efficient approximation to the latent maximum entropy
                 principle. We derive an algorithm that combines
                 expectation-maximization with iterative scaling to
                 produce feasible log-linear solutions. This algorithm
                 can be interpreted as an alternating minimization
                 algorithm in the information divergence, and reveals an
                 intimate connection between the latent maximum entropy
                 and maximum likelihood principles. To select a final
                 model, we generate a series of feasible candidates,
                 calculate the entropy of each, and choose the model
                 that attains the highest entropy. Our experimental
                 results show that estimation based on the latent
                 maximum entropy principle generally gives better
                 results than maximum likelihood when estimating latent
                 variable models on small observed data samples.",
  acknowledgement = ack-nhfb,
  articleno =    "8",
  fjournal =     "ACM Transactions on Knowledge Discovery from Data
                 (TKDD)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1054",
}

@Article{Bhattacharya:2012:CGC,
  author =       "Indrajit Bhattacharya and Shantanu Godbole and
                 Sachindra Joshi and Ashish Verma",
  title =        "Cross-Guided Clustering: Transfer of Relevant
                 Supervision across Tasks",
  journal =      j-TKDD,
  volume =       "6",
  number =       "2",
  pages =        "9:1--9:??",
  month =        jul,
  year =         "2012",
  CODEN =        "????",
  DOI =          "http://dx.doi.org/10.1145/2297456.2297461",
  ISSN =         "1556-4681 (print), 1556-472X (electronic)",
  ISSN-L =       "1556-4681",
  bibdate =      "Tue Nov 6 18:30:38 MST 2012",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tkdd/;
                 http://www.math.utah.edu/pub/tex/bib/tkdd.bib",
  abstract =     "Lack of supervision in clustering algorithms often
                 leads to clusters that are not useful or interesting to
                 human reviewers. We investigate if supervision can be
                 automatically transferred for clustering a target task,
                 by providing a relevant supervised partitioning of a
                 dataset from a different source task. The target
                 clustering is made more meaningful for the human user
                 by trading-off intrinsic clustering goodness on the
                 target task for alignment with relevant supervised
                 partitions in the source task, wherever possible. We
                 propose a cross-guided clustering algorithm that builds
                  on traditional $k$-means by aligning the target clusters
                 with source partitions. The alignment process makes use
                 of a cross-task similarity measure that discovers
                 hidden relationships across tasks. When the source and
                 target tasks correspond to different domains with
                 potentially different vocabularies, we propose a
                 projection approach using pivot vocabularies for the
                 cross-domain similarity measure. Using multiple
                 real-world and synthetic datasets, we show that our
                 approach improves clustering accuracy significantly
                  over traditional $k$-means and state-of-the-art
                 semi-supervised clustering baselines, over a wide range
                 of data characteristics and parameter settings.",
  acknowledgement = ack-nhfb,
  articleno =    "9",
  fjournal =     "ACM Transactions on Knowledge Discovery from Data
                 (TKDD)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1054",
}

@Article{Wang:2012:LBN,
  author =       "Zhenxing Wang and Laiwan Chan",
  title =        "Learning {Bayesian} networks from {Markov} random
                 fields: an efficient algorithm for linear models",
  journal =      j-TKDD,
  volume =       "6",
  number =       "3",
  pages =        "10:1--10:??",
  month =        oct,
  year =         "2012",
  CODEN =        "????",
  DOI =          "http://dx.doi.org/10.1145/2362383.2362384",
  ISSN =         "1556-4681 (print), 1556-472X (electronic)",
  ISSN-L =       "1556-4681",
  bibdate =      "Tue Nov 6 18:30:40 MST 2012",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tkdd/;
                 http://www.math.utah.edu/pub/tex/bib/tkdd.bib",
  abstract =     "Dependency analysis is a typical approach for Bayesian
                 network learning, which infers the structures of
                 Bayesian networks by the results of a series of
                 conditional independence (CI) tests. In practice,
                 testing independence conditioning on large sets hampers
                 the performance of dependency analysis algorithms in
                 terms of accuracy and running time for the following
                 reasons. First, testing independence on large sets of
                 variables with limited samples is not stable. Second,
                 for most dependency analysis algorithms, the number of
                 CI tests grows at an exponential rate with the sizes of
                  conditioning sets, and the running time grows at the
                 same rate. Therefore, determining how to reduce the
                 number of CI tests and the sizes of conditioning sets
                 becomes a critical step in dependency analysis
                  algorithms. In this article, we present a two-phase
                 algorithm based on the observation that the structures
                 of Markov random fields are similar to those of
                 Bayesian networks. The first phase of the algorithm
                 constructs a Markov random field from data, which
                 provides a close approximation to the structure of the
                 true Bayesian network; the second phase of the
                 algorithm removes redundant edges according to CI tests
                 to get the true Bayesian network. Both phases use
                 Markov blanket information to reduce the sizes of
                 conditioning sets and the number of CI tests without
                 sacrificing accuracy. An empirical study shows that the
                 two-phase algorithm performs well in terms of accuracy
                 and efficiency.",
  acknowledgement = ack-nhfb,
  articleno =    "10",
  fjournal =     "ACM Transactions on Knowledge Discovery from Data
                 (TKDD)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1054",
}
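
%%% For the linear (Gaussian) models in the title, the CI tests that the
%%% two phases rely on are typically Fisher-$z$ tests of partial
%%% correlation.  A self-contained Python version (a textbook test, not
%%% the authors' implementation) follows:
%%%
%%%   import math
%%%   import numpy as np
%%%
%%%   def ci_independent(data, i, j, cond=(), alpha=0.05):
%%%       # Test X_i independent of X_j given X_cond; data is (n, d).
%%%       n = len(data)
%%%       S = np.corrcoef(data[:, [i, j] + list(cond)], rowvar=False)
%%%       P = np.linalg.inv(S)                          # precision matrix
%%%       r = -P[0, 1] / math.sqrt(P[0, 0] * P[1, 1])   # partial correlation
%%%       z = 0.5 * math.log((1 + r) / (1 - r)) * math.sqrt(n - len(cond) - 3)
%%%       p = 2 * (1 - 0.5 * (1 + math.erf(abs(z) / math.sqrt(2))))
%%%       return p > alpha       # True: independence not rejected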

@Article{Chan:2012:CID,
  author =       "Jeffrey Chan and James Bailey and Christopher Leckie
                 and Michael Houle",
  title =        "{ciForager}: Incrementally discovering regions of
                 correlated change in evolving graphs",
  journal =      j-TKDD,
  volume =       "6",
  number =       "3",
  pages =        "11:1--11:??",
  month =        oct,
  year =         "2012",
  CODEN =        "????",
  DOI =          "http://dx.doi.org/10.1145/2362383.2362385",
  ISSN =         "1556-4681 (print), 1556-472X (electronic)",
  ISSN-L =       "1556-4681",
  bibdate =      "Tue Nov 6 18:30:40 MST 2012",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tkdd/;
                 http://www.math.utah.edu/pub/tex/bib/tkdd.bib",
  abstract =     "Data mining techniques for understanding how graphs
                 evolve over time have become increasingly important.
                 Evolving graphs arise naturally in diverse applications
                 such as computer network topologies, multiplayer games
                 and medical imaging. A natural and interesting problem
                 in evolving graph analysis is the discovery of compact
                 subgraphs that change in a similar manner. Such
                 subgraphs are known as regions of correlated change and
                 they can both summarise change patterns in graphs and
                 help identify the underlying events causing these
                 changes. However, previous techniques for discovering
                 regions of correlated change suffer from limited
                 scalability, making them unsuitable for analysing the
                 evolution of very large graphs. In this paper, we
                  introduce a new algorithm called ciForager, which
                 addresses this scalability challenge and offers
                 considerable improvements. The efficiency of ciForager
                 is based on the use of new incremental techniques for
                 detecting change, as well as the use of Voronoi
                 representations for efficiently determining distance.
                 We experimentally show that ciForager can achieve
                 speedups of up to 1000 times over previous approaches.
                 As a result, it becomes feasible for the first time to
                 discover regions of correlated change in extremely
                 large graphs, such as the entire BGP routing topology
                 of the Internet.",
  acknowledgement = ack-nhfb,
  articleno =    "11",
  fjournal =     "ACM Transactions on Knowledge Discovery from Data
                 (TKDD)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1054",
}

@Article{Wang:2012:CDS,
  author =       "Dingding Wang and Shenghuo Zhu and Tao Li and Yihong
                 Gong",
  title =        "Comparative document summarization via discriminative
                 sentence selection",
  journal =      j-TKDD,
  volume =       "6",
  number =       "3",
  pages =        "12:1--12:??",
  month =        oct,
  year =         "2012",
  CODEN =        "????",
  DOI =          "http://dx.doi.org/10.1145/2362383.2362386",
  ISSN =         "1556-4681 (print), 1556-472X (electronic)",
  ISSN-L =       "1556-4681",
  bibdate =      "Tue Nov 6 18:30:40 MST 2012",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tkdd/;
                 http://www.math.utah.edu/pub/tex/bib/tkdd.bib",
  abstract =     "Given a collection of document groups, a natural
                 question is to identify the differences among them.
                 Although traditional document summarization techniques
                 can summarize the content of the document groups one by
                  one, there is also a clear need to generate a
                 summary of the differences among the document groups.
                 In this article, we study a novel problem, that of
                 summarizing the differences between document groups. A
                 discriminative sentence selection method is proposed to
                 extract the most discriminative sentences which
                 represent the specific characteristics of each document
                 group. Experiments and case studies on real-world data
                 sets demonstrate the effectiveness of our proposed
                 method.",
  acknowledgement = ack-nhfb,
  articleno =    "12",
  fjournal =     "ACM Transactions on Knowledge Discovery from Data
                 (TKDD)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1054",
}
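
%%% A crude form of discriminative sentence selection can be obtained by
%%% scoring each sentence with the summed log-odds of its words under
%%% its own group versus all other groups.  This add-one-smoothed
%%% baseline is only a hypothetical illustration of the idea, not the
%%% authors' method:
%%%
%%%   import math
%%%   from collections import Counter
%%%
%%%   def pick_discriminative(groups, top=2):
%%%       # groups: {group_name: [sentence, ...]}
%%%       tf = {g: Counter(w for s in sents for w in s.lower().split())
%%%             for g, sents in groups.items()}
%%%       chosen = {}
%%%       for g, sents in groups.items():
%%%           own, rest = tf[g], Counter()
%%%           for h, c in tf.items():
%%%               if h != g:
%%%                   rest.update(c)
%%%           n1, n2 = sum(own.values()), sum(rest.values())
%%%           def score(s):
%%%               return sum(math.log((own[w] + 1) / (n1 + 1)) -
%%%                          math.log((rest[w] + 1) / (n2 + 1))
%%%                          for w in s.lower().split())
%%%           chosen[g] = sorted(sents, key=score, reverse=True)[:top]
%%%       return chosen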

@Article{deMelo:2012:FNO,
  author =       "Pedro O. S. {Vaz de Melo} and Virgilio A. F. Almeida
                 and Antonio A. F. Loureiro and Christos Faloutsos",
  title =        "Forecasting in the {NBA} and other team sports:
                 Network effects in action",
  journal =      j-TKDD,
  volume =       "6",
  number =       "3",
  pages =        "13:1--13:??",
  month =        oct,
  year =         "2012",
  CODEN =        "????",
  DOI =          "http://dx.doi.org/10.1145/2362383.2362387",
  ISSN =         "1556-4681 (print), 1556-472X (electronic)",
  ISSN-L =       "1556-4681",
  bibdate =      "Tue Nov 6 18:30:40 MST 2012",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tkdd/;
                 http://www.math.utah.edu/pub/tex/bib/tkdd.bib",
  abstract =     "The multi-million sports-betting market is based on
                 the fact that the task of predicting the outcome of a
                 sports event is very hard. Even with the aid of an
                 uncountable number of descriptive statistics and
                 background information, only a few can correctly guess
                 the outcome of a game or a league. In this work, our
                 approach is to move away from the traditional way of
                 predicting sports events, and instead to model sports
                 leagues as networks of players and teams where the only
                 information available is the work relationships among
                 them. We propose two network-based models to predict
                 the behavior of teams in sports leagues. These models
                 are parameter-free, that is, they do not have a single
                 parameter, and moreover are sport-agnostic: they can be
                 applied directly to any team sports league. First, we
                 view a sports league as a network in evolution, and we
                 infer the implicit feedback behind network changes and
                 properties over the years. Then, we use this knowledge
                 to construct the network-based prediction models, which
                  can, with high probability, indicate
                 how well a team will perform over a season. We compare
                 our proposed models with other prediction models in two
                 of the most popular sports leagues: the National
                 Basketball Association (NBA) and the Major League
                 Baseball (MLB). Our model shows consistently good
                 results in comparison with the other models and,
                 relying upon the network properties of the teams, we
                 achieved a $\approx 14\%$ rank prediction accuracy
                 improvement over our best competitor.",
  acknowledgement = ack-nhfb,
  articleno =    "13",
  fjournal =     "ACM Transactions on Knowledge Discovery from Data
                 (TKDD)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1054",
}

@Article{Ghosh:2012:SIB,
  author =       "Joydeep Ghosh and Padhraic Smyth and Andrew Tomkins
                 and Rich Caruana",
  title =        "Special issue on best of {SIGKDD 2011}",
  journal =      j-TKDD,
  volume =       "6",
  number =       "4",
  pages =        "14:1--14:??",
  month =        dec,
  year =         "2012",
  CODEN =        "????",
  DOI =          "http://dx.doi.org/10.1145/2382577.2382578",
  ISSN =         "1556-4681 (print), 1556-472X (electronic)",
  ISSN-L =       "1556-4681",
  bibdate =      "Mon Jun 24 13:02:40 MDT 2013",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tkdd/;
                 http://www.math.utah.edu/pub/tex/bib/tkdd.bib",
  acknowledgement = ack-nhfb,
  articleno =    "14",
  fjournal =     "ACM Transactions on Knowledge Discovery from Data
                 (TKDD)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1054",
}

@Article{Kaufman:2012:LDM,
  author =       "Shachar Kaufman and Saharon Rosset and Claudia Perlich
                 and Ori Stitelman",
  title =        "Leakage in data mining: Formulation, detection, and
                 avoidance",
  journal =      j-TKDD,
  volume =       "6",
  number =       "4",
  pages =        "15:1--15:??",
  month =        dec,
  year =         "2012",
  CODEN =        "????",
  DOI =          "http://dx.doi.org/10.1145/2382577.2382579",
  ISSN =         "1556-4681 (print), 1556-472X (electronic)",
  ISSN-L =       "1556-4681",
  bibdate =      "Mon Jun 24 13:02:40 MDT 2013",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tkdd/;
                 http://www.math.utah.edu/pub/tex/bib/tkdd.bib",
  abstract =     "Deemed ``one of the top ten data mining mistakes'',
                 leakage is the introduction of information about the
                 data mining target that should not be legitimately
                 available to mine from. In addition to our own industry
                 experience with real-life projects, controversies
                 around several major public data mining competitions
                 held recently such as the INFORMS 2010 Data Mining
                 Challenge and the IJCNN 2011 Social Network Challenge
                 are evidence that this issue is as relevant today as it
                 has ever been. While acknowledging the importance and
                 prevalence of leakage in both synthetic competitions
                 and real-life data mining projects, existing literature
                 has largely left this idea unexplored. What little has
                 been said turns out not to be broad enough to cover
                 more complex cases of leakage, such as those where the
                 classical independently and identically distributed
                  (i.i.d.) assumption is violated, cases that have
                  recently been documented. In our new approach, these cases
                 and others are explained by explicitly defining
                 modeling goals and analyzing the broader framework of
                 the data mining problem. The resulting definition
                 enables us to derive general methodology for dealing
                 with the issue. We show that it is possible to avoid
                 leakage with a simple specific approach to data
                 management followed by what we call a learn-predict
                 separation, and present several ways of detecting
                 leakage when the modeler has no control over how the
                 data have been collected. We also offer an alternative
                 point of view on leakage that is based on causal graph
                 modeling concepts.",
  acknowledgement = ack-nhfb,
  articleno =    "15",
  fjournal =     "ACM Transactions on Knowledge Discovery from Data
                 (TKDD)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1054",
}
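
%%% The learn-predict separation advocated above is, in its simplest
%%% form, a strict temporal split: every feature is computed only from
%%% information available before the prediction cut-off.  A schematic
%%% Python fragment (the field names and cut-off are hypothetical):
%%%
%%%   records = [
%%%       {"ts": "2012-01-15", "y": 0},
%%%       {"ts": "2012-03-02", "y": 1},
%%%       {"ts": "2012-07-20", "y": 1},
%%%   ]
%%%   CUTOFF = "2012-06-01"
%%%
%%%   train = [r for r in records if r["ts"] < CUTOFF]   # learn window
%%%   test = [r for r in records if r["ts"] >= CUTOFF]   # predict window
%%%
%%%   def features(row, history):
%%%       # Only rows strictly earlier than `row' may contribute; using
%%%       # anything at or after row["ts"] would be leakage.
%%%       past = [h for h in history if h["ts"] < row["ts"]]
%%%       return {"n_past_events": len(past)}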

@Article{Mampaey:2012:SDS,
  author =       "Michael Mampaey and Jilles Vreeken and Nikolaj Tatti",
  title =        "Summarizing data succinctly with the most informative
                 itemsets",
  journal =      j-TKDD,
  volume =       "6",
  number =       "4",
  pages =        "16:1--16:??",
  month =        dec,
  year =         "2012",
  CODEN =        "????",
  DOI =          "http://dx.doi.org/10.1145/2382577.2382580",
  ISSN =         "1556-4681 (print), 1556-472X (electronic)",
  ISSN-L =       "1556-4681",
  bibdate =      "Mon Jun 24 13:02:40 MDT 2013",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tkdd/;
                 http://www.math.utah.edu/pub/tex/bib/tkdd.bib",
  abstract =     "Knowledge discovery from data is an inherently
                 iterative process. That is, what we know about the data
                 greatly determines our expectations, and therefore,
                 what results we would find interesting and/or
                 surprising. Given new knowledge about the data, our
                 expectations will change. Hence, in order to avoid
                 redundant results, knowledge discovery algorithms
                 ideally should follow such an iterative updating
                 procedure. With this in mind, we introduce a
                 well-founded approach for succinctly summarizing data
                 with the most informative itemsets; using a
                 probabilistic maximum entropy model, we iteratively
                  find the itemset that provides us the most novel
                  information (that is, the itemset whose frequency in
                  the data surprises us the most) and in turn we update
                  our model accordingly. As we use the maximum entropy
                  principle to obtain unbiased probabilistic models, and
                  only include those itemsets that are most informative
                  with regard to the current model, the summaries we
                  construct are guaranteed to be both descriptive and
                  nonredundant. The algorithm that we present, called
                  MTV, can either discover the top-$k$ most informative
                  itemsets, or employ either the Bayesian
                  Information Criterion (BIC) or the Minimum Description
                  Length (MDL) principle to automatically identify the
                 set of itemsets that together summarize the data well.
                 In other words, our method will ``tell you what you
                 need to know'' about the data. Importantly, it is a
                 one-phase algorithm: rather than picking itemsets from
                 a user-provided candidate set, itemsets and their
                 supports are mined on-the-fly. To further its
                 applicability, we provide an efficient method to
                 compute the maximum entropy distribution using Quick
                 Inclusion-Exclusion. Experiments on our method, using
                 synthetic, benchmark, and real data, show that the
                 discovered summaries are succinct, and correctly
                 identify the key patterns in the data. The models they
                 form attain high likelihoods, and inspection shows that
                 they summarize the data well with increasingly
                 specific, yet nonredundant itemsets.",
  acknowledgement = ack-nhfb,
  articleno =    "16",
  fjournal =     "ACM Transactions on Knowledge Discovery from Data
                 (TKDD)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1054",
}
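
%%% The notion of an itemset whose frequency ``surprises us'' can be
%%% illustrated against the simplest background model, independence of
%%% the individual items (the paper's maximum entropy model is richer;
%%% this sketch is only a hypothetical stand-in):
%%%
%%%   from itertools import combinations
%%%
%%%   def most_surprising(transactions, max_size=3):
%%%       n = len(transactions)
%%%       items = sorted({i for t in transactions for i in t})
%%%       p = {i: sum(i in t for t in transactions) / n for i in items}
%%%       best, gap = None, -1.0
%%%       for k in range(2, max_size + 1):
%%%           for c in combinations(items, k):
%%%               obs = sum(set(c) <= t for t in transactions) / n
%%%               exp = 1.0
%%%               for i in c:
%%%                   exp *= p[i]        # expected under independence
%%%               if abs(obs - exp) > gap:
%%%                   best, gap = c, abs(obs - exp)
%%%       return best, gap
%%%
%%%   tx = [{"a", "b"}, {"a", "b"}, {"a", "b", "c"}, {"c"}]
%%%   most_surprising(tx)    # (('a', 'b'), 0.1875)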

@Article{Chu:2012:TLM,
  author =       "Shumo Chu and James Cheng",
  title =        "Triangle listing in massive networks",
  journal =      j-TKDD,
  volume =       "6",
  number =       "4",
  pages =        "17:1--17:??",
  month =        dec,
  year =         "2012",
  CODEN =        "????",
  DOI =          "http://dx.doi.org/10.1145/2382577.2382581",
  ISSN =         "1556-4681 (print), 1556-472X (electronic)",
  ISSN-L =       "1556-4681",
  bibdate =      "Mon Jun 24 13:02:40 MDT 2013",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tkdd/;
                 http://www.math.utah.edu/pub/tex/bib/tkdd.bib",
  abstract =     "Triangle listing is one of the fundamental algorithmic
                 problems whose solution has numerous applications
                 especially in the analysis of complex networks, such as
                 the computation of clustering coefficients,
                 transitivity, triangular connectivity, trusses, etc.
                 Existing algorithms for triangle listing are mainly
                 in-memory algorithms, whose performance cannot scale
                  with the massive volume of today's fast-growing
                 networks. When the input graph cannot fit in main
                 memory, triangle listing requires random disk accesses
                 that can incur prohibitively huge I/O cost. Some
                 streaming, semistreaming, and sampling algorithms have
                 been proposed but these are approximation algorithms.
                 We propose an I/O-efficient algorithm for triangle
                 listing. Our algorithm is exact and avoids random disk
                 access. Our results show that our algorithm is scalable
                 and outperforms the state-of-the-art in-memory and
                 local triangle estimation algorithms.",
  acknowledgement = ack-nhfb,
  articleno =    "17",
  fjournal =     "ACM Transactions on Knowledge Discovery from Data
                 (TKDD)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1054",
}
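
%%% For graphs that do fit in memory, a standard baseline (not the
%%% authors' I/O-efficient algorithm) is the ``forward'' listing:
%%% orient every edge from the lower-degree endpoint to the higher one
%%% and intersect out-neighbourhoods, so each triangle is reported
%%% exactly once.  A Python sketch:
%%%
%%%   from collections import defaultdict
%%%
%%%   def triangles(edges):
%%%       deg = defaultdict(int)
%%%       for u, v in edges:
%%%           deg[u] += 1; deg[v] += 1
%%%       rank = {v: (d, v) for v, d in deg.items()}  # degree, tie by id
%%%       out = defaultdict(set)
%%%       for u, v in edges:
%%%           a, b = (u, v) if rank[u] < rank[v] else (v, u)
%%%           out[a].add(b)
%%%       for u in list(out):
%%%           for v in out[u]:
%%%               for w in out[u] & out.get(v, set()):
%%%                   yield (u, v, w)
%%%
%%%   sorted(triangles([(1, 2), (2, 3), (1, 3), (3, 4)]))  # [(1, 2, 3)]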

@Article{Chattopadhyay:2012:MDA,
  author =       "Rita Chattopadhyay and Qian Sun and Wei Fan and Ian
                 Davidson and Sethuraman Panchanathan and Jieping Ye",
  title =        "Multisource domain adaptation and its application to
                 early detection of fatigue",
  journal =      j-TKDD,
  volume =       "6",
  number =       "4",
  pages =        "18:1--18:??",
  month =        dec,
  year =         "2012",
  CODEN =        "????",
  DOI =          "http://dx.doi.org/10.1145/2382577.2382582",
  ISSN =         "1556-4681 (print), 1556-472X (electronic)",
  ISSN-L =       "1556-4681",
  bibdate =      "Mon Jun 24 13:02:40 MDT 2013",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tkdd/;
                 http://www.math.utah.edu/pub/tex/bib/tkdd.bib",
  abstract =     "We consider the characterization of muscle fatigue
                 through a noninvasive sensing mechanism such as Surface
                 ElectroMyoGraphy (SEMG). While changes in the
                 properties of SEMG signals with respect to muscle
                 fatigue have been reported in the literature, the large
                 variation in these signals across different individuals
                 makes the task of modeling and classification of SEMG
                 signals challenging. Indeed, the variation in SEMG
                 parameters from subject to subject creates differences
                 in the data distribution. In this article, we propose
                 two transfer learning frameworks based on the
                 multisource domain adaptation methodology for detecting
                  different stages of fatigue using SEMG signals, which
                  address the distribution differences. In the proposed
                 frameworks, the SEMG data of a subject represent a
                 domain; data from multiple subjects in the training set
                 form the multiple source domains and the test subject
                 data form the target domain. SEMG signals are
                 predominantly different in conditional probability
                 distribution across subjects. The key feature of the
                 first framework is a novel weighting scheme that
                 addresses the conditional probability distribution
                 differences across multiple domains (subjects) and the
                 key feature of the second framework is a two-stage
                 domain adaptation methodology which combines weighted
                 data from multiple sources based on marginal
                 probability differences (first stage) as well as
                 conditional probability differences (second stage),
                 with the target domain data. The weights for minimizing
                 the marginal probability differences are estimated
                 independently, while the weights for minimizing
                 conditional probability differences are computed
                 simultaneously by exploiting the potential interaction
                 among multiple sources. We also provide a theoretical
                 analysis on the generalization performance of the
                 proposed multisource domain adaptation formulation
                 using the weighted Rademacher complexity measure. We
                 have validated the proposed frameworks on Surface
                 ElectroMyoGram signals collected from 8 people during a
                 fatigue-causing repetitive gripping activity.
                 Comprehensive experiments on the SEMG dataset
                 demonstrate that the proposed method improves the
                 classification accuracy by 20\% to 30\% over the cases
                 without any domain adaptation method and by 13\% to
                 30\% over existing state-of-the-art domain adaptation
                 methods.",
  acknowledgement = ack-nhfb,
  articleno =    "18",
  fjournal =     "ACM Transactions on Knowledge Discovery from Data
                 (TKDD)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1054",
}
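
%%% A much simpler relative of the weighting schemes described above:
%%% weight each source-subject model by its accuracy on a handful of
%%% labelled target samples and take the weighted vote.  The models are
%%% assumed to be scikit-learn-style estimators with a `predict' method;
%%% this is a hypothetical baseline, not the paper's two-stage method:
%%%
%%%   import numpy as np
%%%
%%%   def adapt_predict(models, X_few, y_few, X_test):
%%%       w = np.array([(m.predict(X_few) == y_few).mean()
%%%                     for m in models])
%%%       w = w / w.sum()                    # per-source weights
%%%       votes = np.stack([m.predict(X_test) for m in models])  # (S, n)
%%%       classes = np.unique(votes)
%%%       score = np.array([[w[votes[:, i] == c].sum() for c in classes]
%%%                         for i in range(votes.shape[1])])
%%%       return classes[score.argmax(1)]    # weighted majority vote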

@Article{Wilkinson:2012:SIS,
  author =       "Leland Wilkinson and Anushka Anand and Tuan Nhon
                 Dang",
  title =        "Substantial improvements in the set-covering
                 projection classifier {CHIRP} (composite hypercubes on
                 iterated random projections)",
  journal =      j-TKDD,
  volume =       "6",
  number =       "4",
  pages =        "19:1--19:??",
  month =        dec,
  year =         "2012",
  CODEN =        "????",
  DOI =          "http://dx.doi.org/10.1145/2382577.2382583",
  ISSN =         "1556-4681 (print), 1556-472X (electronic)",
  ISSN-L =       "1556-4681",
  bibdate =      "Mon Jun 24 13:02:40 MDT 2013",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tkdd/;
                 http://www.math.utah.edu/pub/tex/bib/tkdd.bib",
  abstract =     "In Wilkinson et al. [2011] we introduced a new
                 set-covering random projection classifier that achieved
                 average error lower than that of other classifiers in
                 the Weka platform. This classifier was based on an
                 $L^\infty$ norm distance function and exploited an
                 iterative sequence of three stages (projecting,
                 binning, and covering) to deal with the curse of
                 dimensionality, computational complexity, and nonlinear
                 separability. We now present substantial changes that
                 improve robustness and reduce training and testing time
                 by almost an order of magnitude without jeopardizing
                 CHIRP's outstanding error performance.",
  acknowledgement = ack-nhfb,
  articleno =    "19",
  fjournal =     "ACM Transactions on Knowledge Discovery from Data
                 (TKDD)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1054",
}

@Article{Angiulli:2013:NNB,
  author =       "Fabrizio Angiulli and Fabio Fassetti",
  title =        "Nearest Neighbor-Based Classification of Uncertain
                 Data",
  journal =      j-TKDD,
  volume =       "7",
  number =       "1",
  pages =        "1:1--1:??",
  month =        mar,
  year =         "2013",
  CODEN =        "????",
  DOI =          "http://dx.doi.org/10.1145/2435209.2435210",
  ISSN =         "1556-4681 (print), 1556-472X (electronic)",
  ISSN-L =       "1556-4681",
  bibdate =      "Mon Jun 24 13:02:44 MDT 2013",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tkdd/;
                 http://www.math.utah.edu/pub/tex/bib/tkdd.bib",
  abstract =     "This work deals with the problem of classifying
                 uncertain data. With this aim we introduce the
                 Uncertain Nearest Neighbor (UNN) rule, which represents
                 the generalization of the deterministic nearest
                 neighbor rule to the case in which uncertain objects
                 are available. The UNN rule relies on the concept of
                 nearest neighbor class, rather than on that of nearest
                 neighbor object. The nearest neighbor class of a test
                 object is the class that maximizes the probability of
                 providing its nearest neighbor. The evidence is that
                 the former concept is much more powerful than the
                 latter in the presence of uncertainty, in that it
                  correctly captures the semantics of the nearest
                 neighbor decision rule when applied to the uncertain
                 scenario. An effective and efficient algorithm to
                 perform uncertain nearest neighbor classification of a
                 generic (un)certain test object is designed, based on
                 properties that greatly reduce the temporal cost
                 associated with nearest neighbor class probability
                 computation. Experimental results are presented,
                 showing that the UNN rule is effective and efficient in
                 classifying uncertain data.",
  acknowledgement = ack-nhfb,
  articleno =    "1",
  fjournal =     "ACM Transactions on Knowledge Discovery from Data
                 (TKDD)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1054",
}
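
%%% The nearest neighbor class probability at the core of the UNN rule
%%% can be estimated by Monte Carlo sampling: repeatedly realize every
%%% uncertain object and record which class supplies the nearest
%%% neighbor.  (The paper develops properties that make this far
%%% cheaper; the Gaussian uncertainty below is invented for
%%% illustration.)
%%%
%%%   import numpy as np
%%%
%%%   def unn_classify(test_draw, train_draws, labels, trials=2000, seed=0):
%%%       rng = np.random.default_rng(seed)
%%%       wins = {}
%%%       for _ in range(trials):
%%%           x = test_draw(rng)
%%%           pts = np.array([d(rng) for d in train_draws])
%%%           c = labels[np.argmin(((pts - x) ** 2).sum(1))]
%%%           wins[c] = wins.get(c, 0) + 1
%%%       return max(wins, key=wins.get)   # nearest neighbor class
%%%
%%%   g = lambda mu: (lambda rng: rng.normal(mu, 0.3, size=2))
%%%   unn_classify(g([0.2, 0.5]), [g([0, 0]), g([0, 1]), g([3, 3])],
%%%                labels=["A", "A", "B"])   # -> "A"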

@Article{Wang:2013:CDS,
  author =       "Dingding Wang and Shenghuo Zhu and Tao Li and Yihong
                 Gong",
  title =        "Comparative Document Summarization via Discriminative
                 Sentence Selection",
  journal =      j-TKDD,
  volume =       "7",
  number =       "1",
  pages =        "2:1--2:??",
  month =        mar,
  year =         "2013",
  CODEN =        "????",
  DOI =          "http://dx.doi.org/10.1145/2435209.2435211",
  ISSN =         "1556-4681 (print), 1556-472X (electronic)",
  ISSN-L =       "1556-4681",
  bibdate =      "Mon Jun 24 13:02:44 MDT 2013",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tkdd/;
                 http://www.math.utah.edu/pub/tex/bib/tkdd.bib",
  abstract =     "Given a collection of document groups, a natural
                 question is to identify the differences among these
                 groups. Although traditional document summarization
                 techniques can summarize the content of the document
                 groups one by one, there exists a great necessity to
                 generate a summary of the differences among the
                 document groups. In this article, we study a novel
                 problem of summarizing the differences between document
                 groups. A discriminative sentence selection method is
                 proposed to extract the most discriminative sentences
                 that represent the specific characteristics of each
                 document group. Experiments and case studies on
                 real-world data sets demonstrate the effectiveness of
                 our proposed method.",
  acknowledgement = ack-nhfb,
  articleno =    "2",
  fjournal =     "ACM Transactions on Knowledge Discovery from Data
                 (TKDD)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1054",
}

@Article{Bayati:2013:MPA,
  author =       "Mohsen Bayati and David F. Gleich and Amin Saberi and
                 Ying Wang",
  title =        "Message-Passing Algorithms for Sparse Network
                 Alignment",
  journal =      j-TKDD,
  volume =       "7",
  number =       "1",
  pages =        "3:1--3:??",
  month =        mar,
  year =         "2013",
  CODEN =        "????",
  DOI =          "http://dx.doi.org/10.1145/2435209.2435212",
  ISSN =         "1556-4681 (print), 1556-472X (electronic)",
  ISSN-L =       "1556-4681",
  bibdate =      "Mon Jun 24 13:02:44 MDT 2013",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tkdd/;
                 http://www.math.utah.edu/pub/tex/bib/tkdd.bib",
  abstract =     "Network alignment generalizes and unifies several
                 approaches for forming a matching or alignment between
                 the vertices of two graphs. We study a mathematical
                  programming framework for the network alignment problem
                 a sparse variation of it where only a small number of
                 matches between the vertices of the two graphs are
                 possible. We propose a new message passing algorithm
                 that allows us to compute, very efficiently,
                 approximate solutions to the sparse network alignment
                 problems with graph sizes as large as hundreds of
                 thousands of vertices. We also provide extensive
                 simulations comparing our algorithms with two of the
                 best solvers for network alignment problems on two
                 synthetic matching problems, two bioinformatics
                 problems, and three large ontology alignment problems
                 including a multilingual problem with a known labeled
                 alignment.",
  acknowledgement = ack-nhfb,
  articleno =    "3",
  fjournal =     "ACM Transactions on Knowledge Discovery from Data
                 (TKDD)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1054",
}

@Article{Li:2013:CWM,
  author =       "Bin Li and Steven C. H. Hoi and Peilin Zhao and
                 Vivekanand Gopalkrishnan",
  title =        "Confidence Weighted Mean Reversion Strategy for Online
                 Portfolio Selection",
  journal =      j-TKDD,
  volume =       "7",
  number =       "1",
  pages =        "4:1--4:??",
  month =        mar,
  year =         "2013",
  CODEN =        "????",
  DOI =          "http://dx.doi.org/10.1145/2435209.2435213",
  ISSN =         "1556-4681 (print), 1556-472X (electronic)",
  ISSN-L =       "1556-4681",
  bibdate =      "Mon Jun 24 13:02:44 MDT 2013",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tkdd/;
                 http://www.math.utah.edu/pub/tex/bib/tkdd.bib",
  abstract =     "Online portfolio selection has been attracting
                 increasing attention from the data mining and machine
                 learning communities. All existing online portfolio
                  selection strategies focus on the first-order
                  information of a portfolio vector, though the
                  second-order information may also be beneficial to a
                  strategy.
                 Moreover, empirical evidence shows that relative stock
                 prices may follow the mean reversion property, which
                 has not been fully exploited by existing strategies.
                 This article proposes a novel online portfolio
                 selection strategy named Confidence Weighted Mean
                 Reversion (CWMR). Inspired by the mean reversion
                 principle in finance and confidence weighted online
                 learning technique in machine learning, CWMR models the
                 portfolio vector as a Gaussian distribution, and
                 sequentially updates the distribution by following the
                 mean reversion trading principle. CWMR's closed-form
                 updates clearly reflect the mean reversion trading
                 idea. We also present several variants of CWMR
                 algorithms, including a CWMR mixture algorithm that is
                  theoretically universal. Empirically, the CWMR strategy is
                 able to effectively exploit the power of mean reversion
                 for online portfolio selection. Extensive experiments
                 on various real markets show that the proposed strategy
                 is superior to the state-of-the-art techniques. The
                  experimental testbed, including source code and data
                 sets is available online.",
  acknowledgement = ack-nhfb,
  articleno =    "4",
  fjournal =     "ACM Transactions on Knowledge Discovery from Data
                 (TKDD)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1054",
}
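
%%% The flavor of a mean reversion update can be seen in the following
%%% passive-aggressive sketch (in the style of CWMR's simpler sibling
%%% PAMR, not CWMR's Gaussian-distribution update): when the day's
%%% return b.x exceeds a threshold, move the portfolio against the
%%% price relatives and re-project onto the simplex.
%%%
%%%   import numpy as np
%%%
%%%   def proj_simplex(v):
%%%       # Euclidean projection onto the probability simplex.
%%%       u = np.sort(v)[::-1]
%%%       css = np.cumsum(u)
%%%       rho = np.nonzero(u * np.arange(1, len(v) + 1) > css - 1)[0][-1]
%%%       return np.maximum(v - (css[rho] - 1) / (rho + 1.0), 0)
%%%
%%%   def mean_reversion_step(b, x, eps=0.5):
%%%       xbar = x.mean()
%%%       denom = ((x - xbar) ** 2).sum()
%%%       tau = max(0.0, (b.dot(x) - eps) / denom) if denom > 0 else 0.0
%%%       return proj_simplex(b - tau * (x - xbar))
%%%
%%%   b = np.ones(3) / 3                    # start from uniform wealth
%%%   mean_reversion_step(b, np.array([1.1, 0.9, 1.0]))
%%%   # -> weight moves onto yesterday's losers: array([0., 1., 0.])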

@Article{Lou:2013:LPR,
  author =       "Tiancheng Lou and Jie Tang and John Hopcroft and
                 Zhanpeng Fang and Xiaowen Ding",
  title =        "Learning to predict reciprocity and triadic closure in
                 social networks",
  journal =      j-TKDD,
  volume =       "7",
  number =       "2",
  pages =        "5:1--5:??",
  month =        jul,
  year =         "2013",
  CODEN =        "????",
  DOI =          "http://dx.doi.org/10.1145/2499907.2499908",
  ISSN =         "1556-4681 (print), 1556-472X (electronic)",
  ISSN-L =       "1556-4681",
  bibdate =      "Thu Mar 13 09:16:06 MDT 2014",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tkdd/;
                 http://www.math.utah.edu/pub/tex/bib/tkdd.bib",
  abstract =     "We study how links are formed in social networks. In
                 particular, we focus on investigating how a reciprocal
                 (two-way) link, the basic relationship in social
                 networks, is developed from a parasocial (one-way)
                 relationship and how the relationships further develop
                 into triadic closure, one of the fundamental processes
                 of link formation. We first investigate how geographic
                 distance and interactions between users influence the
                 formation of link structure among users. Then we study
                 how social theories including homophily, social
                 balance, and social status are satisfied over networks
                 with parasocial and reciprocal relationships. The study
                 unveils several interesting phenomena. For example,
                 ``friend's friend is a friend'' indeed exists in the
                 reciprocal relationship network, but does not hold in
                 the parasocial relationship network. We propose a
                 learning framework to formulate the problems of
                 predicting reciprocity and triadic closure into a
                 graphical model. We demonstrate that it is possible to
                 accurately infer 90\% of reciprocal relationships in a
                 Twitter network. The proposed model also achieves
                 better performance (+20--30\% in terms of F1-measure)
                 than several alternative methods for predicting the
                 triadic closure formation.",
  acknowledgement = ack-nhfb,
  articleno =    "5",
  fjournal =     "ACM Transactions on Knowledge Discovery from Data
                 (TKDD)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1054",
}
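
The article's graphical model is not reproduced here, but the raw
network quantities it predicts are easy to extract. The sketch below
assumes a directed edge set, separates reciprocal from parasocial
links, and enumerates the open two-paths that are the candidate
instances for triadic-closure prediction.

# Reciprocal vs. parasocial links, and open triads (u -> v -> w with
# no closing edge u -> w), from a toy directed edge set.
from collections import defaultdict

edges = {("a", "b"), ("b", "a"), ("b", "c"), ("a", "c"), ("c", "d")}

reciprocal = {(u, v) for (u, v) in edges if (v, u) in edges and u < v}
parasocial = {(u, v) for (u, v) in edges if (v, u) not in edges}

out = defaultdict(set)
for u, v in edges:
    out[u].add(v)
open_triads = [(u, v, w)
               for u in list(out) for v in list(out[u])
               for w in out.get(v, set())
               if w != u and w not in out[u]]
print(len(reciprocal), len(parasocial), len(open_triads))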

@Article{Yang:2013:EOL,
  author =       "Haiqin Yang and Michael R. Lyu and Irwin King",
  title =        "Efficient online learning for multitask feature
                 selection",
  journal =      j-TKDD,
  volume =       "7",
  number =       "2",
  pages =        "6:1--6:??",
  month =        jul,
  year =         "2013",
  CODEN =        "????",
  DOI =          "http://dx.doi.org/10.1145/2499907.2499909",
  ISSN =         "1556-4681 (print), 1556-472X (electronic)",
  ISSN-L =       "1556-4681",
  bibdate =      "Thu Mar 13 09:16:06 MDT 2014",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tkdd/;
                 http://www.math.utah.edu/pub/tex/bib/tkdd.bib",
  abstract =     "Learning explanatory features across multiple related
                 tasks, or MultiTask Feature Selection (MTFS), is an
                 important problem in the applications of data mining,
                 machine learning, and bioinformatics. Previous MTFS
                 methods fulfill this task by batch-mode training. This
                 makes them inefficient when data come sequentially or
                 when the training data are too large to be loaded
                 into memory simultaneously. In
                 order to tackle these problems, we propose a novel
                 online learning framework to solve the MTFS problem. A
                 main advantage of the online algorithm is its
                 efficiency in both time complexity and memory cost. The
                 weights of the MTFS models at each iteration can be
                 updated by closed-form solutions based on the average
                 of previous subgradients. This yields worst-case
                 time-complexity and memory-cost bounds at each
                 iteration of $ O(d \times Q) $, where $d$ is the
                 number of feature dimensions and $Q$ is the number of
                 tasks. Moreover, we provide theoretical
                 analysis for the average regret of the online learning
                 algorithms, which also guarantees the convergence rate
                 of the algorithms. Finally, we conduct detailed
                 experiments to show the characteristics and merits of
                 the online learning algorithms in solving several MTFS
                 problems.",
  acknowledgement = ack-nhfb,
  articleno =    "6",
  fjournal =     "ACM Transactions on Knowledge Discovery from Data
                 (TKDD)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1054",
}
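
The closed-form updates from averaged subgradients can be made
concrete with a dual-averaging-style sketch. The group
soft-thresholding form below (an l2,1 penalty across tasks, one row
per feature) is an assumption based on standard regularized dual
averaging; the article's exact updates and regularizers may differ,
but the per-iteration cost is visibly $ O(d \times Q) $.

# Dual-averaging sketch for multitask feature selection: keep the
# running average of subgradients G_bar (d x Q) and recover W in
# closed form by row-wise group soft-thresholding.
import numpy as np

def mtfs_weights(G_bar, t, lam=0.1, gamma=1.0):
    # Rows shrunk to zero correspond to de-selected features.
    norms = np.linalg.norm(G_bar, axis=1, keepdims=True)
    scale = np.maximum(0.0, 1.0 - lam / np.maximum(norms, 1e-12))
    return -(np.sqrt(t) / gamma) * scale * G_bar

d, Q = 5, 3
rng = np.random.default_rng(0)
G_bar = np.zeros((d, Q))
for t in range(1, 101):
    g_t = rng.normal(size=(d, Q))   # subgradient at step t (stand-in)
    G_bar += (g_t - G_bar) / t      # O(d * Q) running average
    W = mtfs_weights(G_bar, t)      # O(d * Q) closed-form weights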

@Article{Zhang:2013:MRL,
  author =       "Yu Zhang and Dit-Yan Yeung",
  title =        "Multilabel relationship learning",
  journal =      j-TKDD,
  volume =       "7",
  number =       "2",
  pages =        "7:1--7:??",
  month =        jul,
  year =         "2013",
  CODEN =        "????",
  DOI =          "http://dx.doi.org/10.1145/2499907.2499910",
  ISSN =         "1556-4681 (print), 1556-472X (electronic)",
  ISSN-L =       "1556-4681",
  bibdate =      "Thu Mar 13 09:16:06 MDT 2014",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tkdd/;
                 http://www.math.utah.edu/pub/tex/bib/tkdd.bib",
  abstract =     "Multilabel learning problems are commonly found in
                 many applications. A characteristic shared by many
                 multilabel learning problems is that some labels have
                 significant correlations between them. In this article,
                 we propose a novel multilabel learning method, called
                 MultiLabel Relationship Learning (MLRL), which extends
                 the conventional support vector machine by explicitly
                 learning and utilizing the relationships between
                 labels. Specifically, we model the label relationships
                 using a label covariance matrix and use it to define a
                 new regularization term for the optimization problem.
                 MLRL learns the model parameters and the label
                 covariance matrix simultaneously based on a unified
                 convex formulation. To solve the convex optimization
                 problem, we use an alternating method in which each
                 subproblem can be solved efficiently. The relationship
                 between MLRL and two widely used maximum margin methods
                 for multilabel learning is investigated. Moreover, we
                 also propose a semisupervised extension of MLRL, called
                 SSMLRL, to demonstrate how to make use of unlabeled
                 data to help learn the label covariance matrix. Through
                 experiments conducted on some multilabel applications,
                 we find that MLRL not only gives higher classification
                 accuracy but also has better interpretability as
                 revealed by the label covariance matrix.",
  acknowledgement = ack-nhfb,
  articleno =    "7",
  fjournal =     "ACM Transactions on Knowledge Discovery from Data
                 (TKDD)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1054",
}
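
For the alternating optimization described above, the
label-covariance subproblem is the step worth sketching. In the
closely related multitask relationship learning literature, that
subproblem has the closed form
Omega = (W^T W)^(1/2) / tr((W^T W)^(1/2)); the code assumes this form
carries over to the label covariance here, which the article may
refine.

# One alternating step: re-estimate the label covariance from the
# current per-label weight vectors (columns of W).
import numpy as np
from scipy.linalg import sqrtm

def update_label_covariance(W):
    S = np.real(sqrtm(W.T.dot(W)))   # matrix square root of W^T W
    return S / np.trace(S)           # normalize to unit trace

rng = np.random.default_rng(0)
W = rng.normal(size=(10, 4))         # 10 features, 4 labels
Omega = update_label_covariance(W)
print(np.trace(Omega))               # 1.0 up to rounding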

@Article{Peng:2013:EFF,
  author =       "Jing Peng and Guna Seetharaman and Wei Fan and Aparna
                 Varde",
  title =        "Exploiting {Fisher} and {Fukunaga--Koontz} transforms
                 in {Chernoff} dimensionality reduction",
  journal =      j-TKDD,
  volume =       "7",
  number =       "2",
  pages =        "8:1--8:??",
  month =        jul,
  year =         "2013",
  CODEN =        "????",
  DOI =          "http://dx.doi.org/10.1145/2499907.2499911",
  ISSN =         "1556-4681 (print), 1556-472X (electronic)",
  ISSN-L =       "1556-4681",
  bibdate =      "Thu Mar 13 09:16:06 MDT 2014",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tkdd/;
                 http://www.math.utah.edu/pub/tex/bib/tkdd.bib",
  abstract =     "Knowledge discovery from big data demands effective
                 representation of data. However, big data are often
                 characterized by high dimensionality, which makes
                 knowledge discovery more difficult. Many techniques for
                 dimensionality reduction have been proposed, including
                 well-known Fisher's Linear Discriminant Analysis (LDA).
                 However, the Fisher criterion is incapable of dealing
                 with heteroscedasticity in the data. A technique based
                 on the Chernoff criterion for linear dimensionality
                 reduction has been proposed that is capable of
                 exploiting heteroscedastic information in the data.
                 While the Chernoff criterion has been shown to
                 outperform Fisher's, a clear understanding of its
                 exact behavior is lacking. In this article, we show
                 precisely what can be expected from the Chernoff
                 criterion. In particular, we show that the Chernoff
                 criterion exploits the Fisher and Fukunaga-Koontz
                 transforms in computing its linear discriminants.
                 Furthermore, we show that a recently proposed
                 decomposition of the data space into four subspaces is
                 incomplete. We provide arguments on how to best enrich
                 the decomposition of the data space in order to account
                 for heteroscedasticity in the data. Finally, we provide
                 experimental results validating our theoretical
                 analysis.",
  acknowledgement = ack-nhfb,
  articleno =    "8",
  fjournal =     "ACM Transactions on Knowledge Discovery from Data
                 (TKDD)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1054",
}
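
The Chernoff criterion itself is algebraically heavy, but the Fisher
directions it generalizes are a two-line eigenproblem, which helps
fix notation: maximize $ w^T S_b w / w^T S_w w $ via the generalized
eigenproblem $ S_b w = \lambda S_w w $. The sketch below is the
classical LDA baseline discussed in the abstract, not the Chernoff
variant.

# Classical Fisher discriminant directions from between-class (S_b)
# and within-class (S_w) scatter matrices.
import numpy as np
from scipy.linalg import eigh

def fisher_directions(X, y, k=1):
    mu = X.mean(axis=0)
    d = X.shape[1]
    Sw, Sb = np.zeros((d, d)), np.zeros((d, d))
    for c in np.unique(y):
        Xc = X[y == c]
        mc = Xc.mean(axis=0)
        Sw += (Xc - mc).T.dot(Xc - mc)
        Sb += len(Xc) * np.outer(mc - mu, mc - mu)
    # Symmetric-definite generalized eigenproblem; the ridge term
    # keeps S_w invertible.
    vals, vecs = eigh(Sb, Sw + 1e-8 * np.eye(d))
    return vecs[:, np.argsort(vals)[::-1][:k]]

rng = np.random.default_rng(0)
X = np.vstack([rng.normal(0, 1, (50, 3)), rng.normal(2, 1, (50, 3))])
y = np.repeat([0, 1], 50)
W = fisher_directions(X, y, k=1)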

@Article{Agarwal:2013:ISI,
  author =       "Deepak Agarwal and Rich Caruana and Jian Pei and Ke
                 Wang",
  title =        "Introduction to the {Special Issue ACM SIGKDD 2012}",
  journal =      j-TKDD,
  volume =       "7",
  number =       "3",
  pages =        "9:1--9:??",
  month =        sep,
  year =         "2013",
  CODEN =        "????",
  DOI =          "http://dx.doi.org/10.1145/2513092.2513093",
  ISSN =         "1556-4681 (print), 1556-472X (electronic)",
  ISSN-L =       "1556-4681",
  bibdate =      "Thu Mar 13 09:16:07 MDT 2014",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tkdd/;
                 http://www.math.utah.edu/pub/tex/bib/tkdd.bib",
  acknowledgement = ack-nhfb,
  articleno =    "9",
  fjournal =     "ACM Transactions on Knowledge Discovery from Data
                 (TKDD)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1054",
}

@Article{Rakthanmanon:2013:ABD,
  author =       "Thanawin Rakthanmanon and Bilson Campana and Abdullah
                 Mueen and Gustavo Batista and Brandon Westover and
                 Qiang Zhu and Jesin Zakaria and Eamonn Keogh",
  title =        "Addressing Big Data Time Series: Mining Trillions of
                 Time Series Subsequences Under Dynamic Time Warping",
  journal =      j-TKDD,
  volume =       "7",
  number =       "3",
  pages =        "10:1--10:??",
  month =        sep,
  year =         "2013",
  CODEN =        "????",
  DOI =          "http://dx.doi.org/10.1145/2500489",
  ISSN =         "1556-4681 (print), 1556-472X (electronic)",
  ISSN-L =       "1556-4681",
  bibdate =      "Thu Mar 13 09:16:07 MDT 2014",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tkdd/;
                 http://www.math.utah.edu/pub/tex/bib/tkdd.bib",
  abstract =     "Most time series data mining algorithms use similarity
                 search as a core subroutine, and thus the time taken
                 for similarity search is the bottleneck for virtually
                 all time series data mining algorithms, including
                 classification, clustering, motif discovery, anomaly
                 detection, and so on. The difficulty of scaling a
                 search to large datasets explains to a great extent why
                 most academic work on time series data mining has
                 plateaued at considering a few millions of time series
                 objects, while much of industry and science sits on
                 billions of time series objects waiting to be explored.
                 In this work we show that by using a combination of
                 four novel ideas we can search and mine massive time
                 series for the first time. We demonstrate the following
                 unintuitive fact: in large datasets we can exactly
                 search under Dynamic Time Warping (DTW) much more
                 quickly than the current state-of-the-art Euclidean
                 distance search algorithms. We demonstrate our work on
                 the largest set of time series experiments ever
                 attempted. In particular, the largest dataset we
                 consider is larger than the combined size of all of the
                 time series datasets considered in all data mining
                 papers ever published. We explain how our ideas allow
                 us to solve higher-level time series data mining
                 problems such as motif discovery and clustering at
                 scales that would otherwise be untenable. Moreover, we
                 show how our ideas allow us to efficiently support the
                 uniform scaling distance measure, a measure whose
                 utility seems to be underappreciated, but which we
                 demonstrate here. In addition to mining massive
                 datasets with up to one trillion datapoints, we will
                 show that our ideas also have implications for
                 real-time monitoring of data streams, allowing us to
                 handle much faster arrival rates and/or use cheaper and
                 lower powered devices than are currently possible.",
  acknowledgement = ack-nhfb,
  articleno =    "10",
  fjournal =     "ACM Transactions on Knowledge Discovery from Data
                 (TKDD)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1054",
}
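
One of the known ingredients behind search at this scale is a cascade
of cheap lower bounds that prune candidates before any full DTW
computation, combined with early abandoning. The sketch below shows
the widely used LB_Keogh bound under two assumptions, equal-length
z-normalized sequences and a Sakoe--Chiba band of width r; it
illustrates the pruning idea, not the article's full suite of
optimizations.

# LB_Keogh lower bound with early abandoning: if the bound already
# exceeds the best distance seen so far, the candidate cannot be the
# nearest neighbor and full DTW is skipped.
import numpy as np

def lb_keogh(query, cand, r, best_so_far=np.inf):
    n = len(query)
    total = 0.0
    for i in range(n):
        lo, hi = max(0, i - r), min(n, i + r + 1)
        u, l = cand[lo:hi].max(), cand[lo:hi].min()  # band envelope
        if query[i] > u:
            total += (query[i] - u) ** 2
        elif query[i] < l:
            total += (query[i] - l) ** 2
        if total >= best_so_far:    # early abandoning
            return np.inf
    return total

q = np.sin(np.linspace(0, 6, 64))
c = np.cos(np.linspace(0, 6, 64))
print(lb_keogh(q, c, r=5))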

@Article{Sun:2013:PIM,
  author =       "Yizhou Sun and Brandon Norick and Jiawei Han and
                 Xifeng Yan and Philip S. Yu and Xiao Yu",
  title =        "{PathSelClus}: Integrating Meta-Path Selection with
                 User-Guided Object Clustering in Heterogeneous
                 Information Networks",
  journal =      j-TKDD,
  volume =       "7",
  number =       "3",
  pages =        "11:1--11:??",
  month =        sep,
  year =         "2013",
  CODEN =        "????",
  DOI =          "http://dx.doi.org/10.1145/2500492",
  ISSN =         "1556-4681 (print), 1556-472X (electronic)",
  ISSN-L =       "1556-4681",
  bibdate =      "Thu Mar 13 09:16:07 MDT 2014",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tkdd/;
                 http://www.math.utah.edu/pub/tex/bib/tkdd.bib",
  abstract =     "Real-world, multiple-typed objects are often
                 interconnected, forming heterogeneous information
                 networks. A major challenge for link-based clustering
                 in such networks is their potential to generate many
                 different results, carrying rather diverse semantic
                 meanings. In order to generate desired clustering, we
                 propose to use meta-path, a path that connects object
                 types via a sequence of relations, to control
                 clustering with distinct semantics. Nevertheless, it is
                 easier for a user to provide a few examples (seeds)
                 than a weighted combination of sophisticated meta-paths
                 to specify her clustering preference. Thus, we propose
                 to integrate meta-path selection with user-guided
                 clustering to cluster objects in networks, where a user
                 first provides a small set of object seeds for each
                 cluster as guidance. Then the system learns the weight
                 for each meta-path that is consistent with the
                 clustering result implied by the guidance, and
                 generates clusters under the learned weights of
                 meta-paths. A probabilistic approach is proposed to
                 solve the problem, and an effective and efficient
                 iterative algorithm, PathSelClus, is proposed to learn
                 the model, where the clustering quality and the
                 meta-path weights mutually enhance each other. Our
                 experiments with several clustering tasks in two real
                 networks and one synthetic network demonstrate the
                 power of the algorithm in comparison with the
                 baselines.",
  acknowledgement = ack-nhfb,
  articleno =    "11",
  fjournal =     "ACM Transactions on Knowledge Discovery from Data
                 (TKDD)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1054",
}
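
A meta-path is typically materialized as a product of type-level
adjacency matrices, and the learned weights combine several such
matrices. The sketch below fixes the weights by hand purely for
illustration; in PathSelClus they are learned jointly with the
clustering.

# Two toy meta-paths over an author/paper/venue schema:
# Author-Paper-Author and Author-Paper-Venue-Paper-Author.
import numpy as np

A_ap = np.array([[1, 1, 0],          # 2 authors x 3 papers
                 [0, 1, 1]])
A_pv = np.array([[1, 0],             # 3 papers x 2 venues
                 [1, 0],
                 [0, 1]])

M_apa = A_ap.dot(A_ap.T)                              # co-authorship
M_apvpa = A_ap.dot(A_pv).dot(A_pv.T).dot(A_ap.T)      # shared venues

weights = np.array([0.7, 0.3])       # learned in the article; fixed here
M = weights[0] * M_apa + weights[1] * M_apvpa
print(M)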

@Article{Bellare:2013:ASE,
  author =       "Kedar Bellare and Suresh Iyengar and Aditya
                 Parameswaran and Vibhor Rastogi",
  title =        "Active Sampling for Entity Matching with Guarantees",
  journal =      j-TKDD,
  volume =       "7",
  number =       "3",
  pages =        "12:1--12:??",
  month =        sep,
  year =         "2013",
  CODEN =        "????",
  DOI =          "http://dx.doi.org/10.1145/2500490",
  ISSN =         "1556-4681 (print), 1556-472X (electronic)",
  ISSN-L =       "1556-4681",
  bibdate =      "Thu Mar 13 09:16:07 MDT 2014",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tkdd/;
                 http://www.math.utah.edu/pub/tex/bib/tkdd.bib",
  abstract =     "In entity matching, a fundamental issue while training
                 a classifier to label pairs of entities as either
                 duplicates or nonduplicates is the one of selecting
                 informative training examples. Although active learning
                 presents an attractive solution to this problem,
                 previous approaches minimize the misclassification rate
                 (0--1 loss) of the classifier, which is an unsuitable
                 metric for entity matching due to class imbalance
                 (i.e., many more nonduplicate pairs than duplicate
                 pairs). To address this, a recent paper [Arasu et al.
                 2010] proposes to maximize recall of the classifier
                 under the constraint that its precision should be
                 greater than a specified threshold. However, the
                 proposed technique requires the labels of all $n$ input
                 pairs in the worst case. Our main result is an active
                 learning algorithm that approximately maximizes recall
                 of the classifier while respecting a precision
                 constraint with provably sublinear label complexity
                 (under certain distributional assumptions). Our
                 algorithm uses as a black box any active learning
                 module that minimizes 0--1 loss. We show that label
                 complexity of our algorithm is at most $ \log n $ times the
                 label complexity of the black box, and also bound the
                 difference in the recall of the classifier learned by our
                 algorithm and the recall of the optimal classifier
                 satisfying the precision constraint. We provide an
                 empirical evaluation of our algorithm on several
                 real-world matching data sets that demonstrates the
                 effectiveness of our approach.",
  acknowledgement = ack-nhfb,
  articleno =    "12",
  fjournal =     "ACM Transactions on Knowledge Discovery from Data
                 (TKDD)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1054",
}
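
The article's contribution is the active-learning reduction itself,
which is not reproduced here. As a generic, non-active stand-in for
the precision-constrained objective, one can sweep the class-cost
ratio of any weighted classifier and keep the highest-recall model
whose validation precision clears the threshold; every name below is
illustrative.

# Maximize recall subject to precision >= tau by sweeping class
# weights (a generic baseline, not the article's algorithm).
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import precision_score, recall_score

def max_recall_at_precision(X_tr, y_tr, X_va, y_va, tau=0.9):
    best, best_recall = None, -1.0
    for w in (1, 2, 5, 10, 20, 50):
        clf = LogisticRegression(class_weight={0: 1, 1: w},
                                 max_iter=1000).fit(X_tr, y_tr)
        pred = clf.predict(X_va)
        if precision_score(y_va, pred, zero_division=0) >= tau:
            rec = recall_score(y_va, pred)
            if rec > best_recall:
                best, best_recall = clf, rec
    return best, best_recall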

@Article{Chattopadhyay:2013:BMA,
  author =       "Rita Chattopadhyay and Zheng Wang and Wei Fan and Ian
                 Davidson and Sethuraman Panchanathan and Jieping Ye",
  title =        "Batch Mode Active Sampling Based on Marginal
                 Probability Distribution Matching",
  journal =      j-TKDD,
  volume =       "7",
  number =       "3",
  pages =        "13:1--13:??",
  month =        sep,
  year =         "2013",
  CODEN =        "????",
  DOI =          "http://dx.doi.org/10.1145/2513092.2513094",
  ISSN =         "1556-4681 (print), 1556-472X (electronic)",
  ISSN-L =       "1556-4681",
  bibdate =      "Thu Mar 13 09:16:07 MDT 2014",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tkdd/;
                 http://www.math.utah.edu/pub/tex/bib/tkdd.bib",
  abstract =     "Active Learning is a machine learning and data mining
                 technique that selects the most informative samples for
                 labeling and uses them as training data; it is
                 especially useful when there is a large amount of
                 unlabeled data and labeling them is expensive.
                 Recently, batch-mode active learning, where a set of
                 samples are selected concurrently for labeling, based
                 on their collective merit, has attracted a lot of
                 attention. The objective of batch-mode active learning
                 is to select a set of informative samples so that a
                 classifier learned on these samples has good
                 generalization performance on the unlabeled data. Most
                 of the existing batch-mode active learning
                 methodologies try to achieve this by selecting samples
                 based on certain criteria. In this article we propose a
                 novel criterion which achieves good generalization
                 performance of a classifier by specifically selecting a
                 set of query samples that minimize the difference in
                 distribution between the labeled and the unlabeled
                 data, after annotation. We explicitly measure this
                 difference based on all candidate subsets of the
                 unlabeled data and select the best subset. The proposed
                 objective is an NP-hard integer programming
                 optimization problem. We provide two optimization
                 techniques to solve this problem. In the first one, the
                 problem is transformed into a convex quadratic
                 programming problem and in the second method the
                 problem is transformed into a linear programming
                 problem. Our empirical studies using publicly available
                 UCI datasets and two biomedical image databases
                 demonstrate the effectiveness of the proposed approach
                 in comparison with the state-of-the-art batch-mode
                 active learning methods. We also present two extensions
                 of the proposed approach, which incorporate uncertainty
                 of the predicted labels of the unlabeled data and
                 transfer learning in the proposed formulation. In
                 addition, we present a joint optimization framework for
                 performing both transfer and active learning
                 simultaneously, unlike existing approaches that
                 learn in two separate stages, typically transfer
                 learning followed by active learning. We
                 specifically minimize a common objective of reducing
                 distribution difference between the domain-adapted
                 source, the queried and labeled samples and the rest of
                 the unlabeled target domain data. Our empirical studies
                 on two biomedical image databases and on a publicly
                 available 20 Newsgroups dataset show that incorporation
                 of uncertainty information and transfer learning
                 further improves the performance of the proposed active
                 learning based classifier. Our empirical studies also
                 show that the proposed transfer-active method based on
                 the joint optimization framework performs significantly
                 better than a framework which implements transfer and
                 active learning in two separate stages.",
  acknowledgement = ack-nhfb,
  articleno =    "13",
  fjournal =     "ACM Transactions on Knowledge Discovery from Data
                 (TKDD)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1054",
}
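
The selection criterion, choosing the batch that keeps the labeled
set closest in distribution to the unlabeled pool, can be made
concrete with an empirical Maximum Mean Discrepancy (MMD). The
article solves convex QP/LP relaxations of the NP-hard subset
problem; the greedy loop below only illustrates the objective being
minimized, with an RBF kernel as an assumed choice.

# Greedy batch selection minimizing empirical MMD between
# (labeled + queried) samples and the unlabeled pool.
import numpy as np

def rbf(A, B, gamma=1.0):
    d2 = ((A[:, None, :] - B[None, :, :]) ** 2).sum(-1)
    return np.exp(-gamma * d2)

def mmd2(X, Y, gamma=1.0):
    return (rbf(X, X, gamma).mean() + rbf(Y, Y, gamma).mean()
            - 2.0 * rbf(X, Y, gamma).mean())

def greedy_batch(pool, labeled, k, gamma=1.0):
    chosen = []
    for _ in range(k):
        rest = [i for i in range(len(pool)) if i not in chosen]
        best = min(rest, key=lambda i: mmd2(
            np.vstack([labeled, pool[chosen + [i]]]), pool, gamma))
        chosen.append(best)
    return chosen

rng = np.random.default_rng(0)
pool, labeled = rng.normal(size=(30, 2)), rng.normal(size=(5, 2))
print(greedy_batch(pool, labeled, k=3))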

@Article{Briggs:2013:IAM,
  author =       "Forrest Briggs and Xiaoli Z. Fern and Raviv Raich and
                 Qi Lou",
  title =        "Instance Annotation for Multi-Instance Multi-Label
                 Learning",
  journal =      j-TKDD,
  volume =       "7",
  number =       "3",
  pages =        "14:1--14:??",
  month =        sep,
  year =         "2013",
  CODEN =        "????",
  DOI =          "http://dx.doi.org/10.1145/2500491",
  ISSN =         "1556-4681 (print), 1556-472X (electronic)",
  ISSN-L =       "1556-4681",
  bibdate =      "Thu Mar 13 09:16:07 MDT 2014",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tkdd/;
                 http://www.math.utah.edu/pub/tex/bib/tkdd.bib",
  abstract =     "Multi-instance multi-label learning (MIML) is a
                 framework for supervised classification where the
                 objects to be classified are bags of instances
                 associated with multiple labels. For example, an image
                 can be represented as a bag of segments and associated
                 with a list of objects it contains. Prior work on MIML
                 has focused on predicting label sets for previously
                 unseen bags. We instead consider the problem of
                 predicting instance labels while learning from data
                 labeled only at the bag level. We propose a regularized
                 rank-loss objective designed for instance annotation,
                 which can be instantiated with different aggregation
                 models connecting instance-level labels with bag-level
                 label sets. The aggregation models that we consider can
                 be factored as a linear function of a ``support
                 instance'' for each class, which is a single feature
                 vector representing a whole bag. Hence we name our
                 proposed methods rank-loss Support Instance Machines
                 (SIM). We propose two optimization methods for the
                 rank-loss objective, which is nonconvex. One is a
                 heuristic method that alternates between updating
                 support instances, and solving a convex problem in
                 which the support instances are treated as constant.
                 The other is to apply the constrained concave-convex
                 procedure (CCCP), which can also be interpreted as
                 iteratively updating support instances and solving a
                 convex problem. To solve the convex problem, we employ
                 the Pegasos framework of primal subgradient descent,
                 and prove that it finds an $ \epsilon $-suboptimal
                 solution in runtime that is linear in the number of
                 bags, instances, and $ 1 / \epsilon $. Additionally, we
                 suggest a method of extending the linear learning
                 algorithm to nonlinear classification, without
                 increasing the runtime asymptotically. Experiments on
                 artificial and real-world datasets including images and
                 audio show that the proposed methods achieve higher
                 accuracy than other loss functions used in prior work,
                 e.g., Hamming loss, and recent work in ambiguous label
                 classification.",
  acknowledgement = ack-nhfb,
  articleno =    "14",
  fjournal =     "ACM Transactions on Knowledge Discovery from Data
                 (TKDD)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1054",
}
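
The convex subproblems above are solved with Pegasos-style primal
subgradient descent, which is compact enough to sketch. The version
below uses the standard hinge loss; the article substitutes its
rank-loss objective over support instances, which is not reproduced
here.

# Minimal Pegasos: stochastic subgradient descent on the primal SVM
# objective with step size 1 / (lambda * t).
import numpy as np

def pegasos(X, y, lam=0.1, epochs=10, seed=0):
    rng = np.random.default_rng(seed)
    w = np.zeros(X.shape[1])
    t = 0
    for _ in range(epochs):
        for i in rng.permutation(len(y)):
            t += 1
            eta = 1.0 / (lam * t)
            if y[i] * np.dot(w, X[i]) < 1.0:     # margin violated
                w = (1.0 - eta * lam) * w + eta * y[i] * X[i]
            else:
                w = (1.0 - eta * lam) * w
    return w

rng = np.random.default_rng(1)
X = np.vstack([rng.normal(-1, 1, (50, 2)), rng.normal(1, 1, (50, 2))])
y = np.repeat([-1.0, 1.0], 50)
w = pegasos(X, y)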

@Article{Ji:2013:PFR,
  author =       "Ming Ji and Binbin Lin and Xiaofei He and Deng Cai and
                 Jiawei Han",
  title =        "Parallel Field Ranking",
  journal =      j-TKDD,
  volume =       "7",
  number =       "3",
  pages =        "15:1--15:??",
  month =        sep,
  year =         "2013",
  CODEN =        "????",
  DOI =          "http://dx.doi.org/10.1145/2513092.2513096",
  ISSN =         "1556-4681 (print), 1556-472X (electronic)",
  ISSN-L =       "1556-4681",
  bibdate =      "Thu Mar 13 09:16:07 MDT 2014",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tkdd/;
                 http://www.math.utah.edu/pub/tex/bib/tkdd.bib",
  abstract =     "Recently, ranking data with respect to the intrinsic
                 geometric structure (manifold ranking) has received
                 considerable attention, with encouraging performance
                 in many applications in pattern recognition,
                 information retrieval and recommendation systems. Most
                 of the existing manifold ranking methods focus on
                 learning a ranking function that varies smoothly along
                 the data manifold. However, beyond smoothness, a
                 desirable ranking function should vary monotonically
                 along the geodesics of the data manifold, such that the
                 ranking order along the geodesics is preserved. In this
                 article, we aim to learn a ranking function that varies
                 linearly and therefore monotonically along the
                 geodesics of the data manifold. Recent theoretical work
                 shows that the gradient field of a linear function on
                 the manifold has to be a parallel vector field.
                 Therefore, we propose a novel ranking algorithm on the
                 data manifolds, called Parallel Field Ranking.
                 Specifically, we try to learn a ranking function and a
                 vector field simultaneously. We require the vector
                 field to be close to the gradient field of the ranking
                 function, and the vector field to be as parallel as
                 possible. Moreover, we require the value of the ranking
                 function at the query point to be the highest, and then
                 decrease linearly along the manifold. Experimental
                 results on both synthetic data and real data
                 demonstrate the effectiveness of our proposed
                 algorithm.",
  acknowledgement = ack-nhfb,
  articleno =    "15",
  fjournal =     "ACM Transactions on Knowledge Discovery from Data
                 (TKDD)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1054",
}
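
For orientation, the classical manifold-ranking baseline that this
work refines has a one-line closed form, f = (I - alpha S)^(-1) y,
with S the symmetrically normalized affinity matrix and y the query
indicator. The parallel-field objective of the article replaces
smoothness with linearity along geodesics and is not reproduced
here.

# Classical manifold ranking on a graph affinity matrix W.
import numpy as np

def manifold_ranking(W, query_idx, alpha=0.9):
    deg = W.sum(axis=1)
    D_inv_sqrt = np.diag(1.0 / np.sqrt(np.maximum(deg, 1e-12)))
    S = D_inv_sqrt.dot(W).dot(D_inv_sqrt)
    y = np.zeros(W.shape[0])
    y[query_idx] = 1.0
    return np.linalg.solve(np.eye(len(y)) - alpha * S, y)

# On a 5-node chain, scores decay monotonically away from the query,
# the kind of behavior the article asks of a ranking function.
W = np.diag(np.ones(4), 1) + np.diag(np.ones(4), -1)
print(manifold_ranking(W, query_idx=0))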

@Article{Adali:2013:IPR,
  author =       "Sibel Adali and Malik Magdon-Ismail and Xiaohui Lu",
  title =        "{iHypR}: Prominence ranking in networks of
                 collaborations with hyperedges",
  journal =      j-TKDD,
  volume =       "7",
  number =       "4",
  pages =        "16:1--16:??",
  month =        nov,
  year =         "2013",
  CODEN =        "????",
  DOI =          "http://dx.doi.org/10.1145/2541268.2541269",
  ISSN =         "1556-4681 (print), 1556-472X (electronic)",
  ISSN-L =       "1556-4681",
  bibdate =      "Thu Mar 13 09:16:09 MDT 2014",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tkdd/;
                 http://www.math.utah.edu/pub/tex/bib/tkdd.bib",
  abstract =     "We present a new algorithm called iHypR for computing
                 prominence of actors in social networks of
                 collaborations. Our algorithm builds on the assumption
                 that prominent actors collaborate on prominent objects,
                 and prominent objects are naturally grouped into
                 prominent clusters or groups (hyperedges in a graph).
                 iHypR makes use of the relationships between actors,
                 objects, and hyperedges to compute a global prominence
                 score for the actors in the network. We do not assume
                 the hyperedges are given in advance. Hyperedges
                 computed by our method can perform as well as or even
                 better than ``true'' hyperedges. Our algorithm is
                 customized for networks of collaborations, but it is
                 generally applicable without further tuning. We show,
                 through extensive experimentation with three real-life
                 data sets and multiple external measures of prominence,
                 that our algorithm outperforms existing well-known
                 algorithms. Our work is the first to offer such an
                 extensive evaluation. We show that, unlike most
                 existing algorithms, ours performs robustly across
                 multiple measures of performance. Further, we give a
                 detailed
                 study of the sensitivity of our algorithm to different
                 data sets and the design choices within the algorithm
                 that a user may wish to change. Our article illustrates
                 the various trade-offs that must be considered in
                 computing prominence in collaborative social
                 networks.",
  acknowledgement = ack-nhfb,
  articleno =    "16",
  fjournal =     "ACM Transactions on Knowledge Discovery from Data
                 (TKDD)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1054",
}

@Article{Huang:2013:STP,
  author =       "Jin Huang and Feiping Nie and Heng Huang and Yi-Cheng
                 Tu and Yu Lei",
  title =        "Social trust prediction using heterogeneous networks",
  journal =      j-TKDD,
  volume =       "7",
  number =       "4",
  pages =        "17:1--17:??",
  month =        nov,
  year =         "2013",
  CODEN =        "????",
  DOI =          "http://dx.doi.org/10.1145/2541268.2541270",
  ISSN =         "1556-4681 (print), 1556-472X (electronic)",
  ISSN-L =       "1556-4681",
  bibdate =      "Thu Mar 13 09:16:09 MDT 2014",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tkdd/;
                 http://www.math.utah.edu/pub/tex/bib/tkdd.bib",
  abstract =     "Along with increasing popularity of social websites,
                 online users rely more on the trustworthiness
                 information to make decisions, extract and filter
                 information, and tag and build connections with other
                 users. However, such social network data often suffer
                 from severe data sparsity and are not able to provide
                 users with enough information. Therefore, trust
                 prediction has emerged as an important topic in social
                 network research. Traditional approaches are primarily
                 based on exploring trust graph topology itself.
                 However, research in sociology and our life experience
                 suggest that people who are in the same social circle
                 often exhibit similar behaviors and tastes. To take
                 advantage of the ancillary information for trust
                 prediction, the challenge then becomes what to transfer
                 and how to transfer. In this article, we address this
                 problem by aggregating heterogeneous social networks
                 and propose a novel joint social networks mining (JSNM)
                 method. Our new joint learning model explores the
                 user-group-level similarity between correlated graphs
                 and simultaneously learns the individual graph
                 structure; therefore, the shared structures and
                 patterns from multiple social networks can be utilized
                 to enhance the prediction tasks. As a result, we not
                 only improve the trust prediction in the target graph
                 but also facilitate other information retrieval tasks
                 in the auxiliary graphs. To optimize the proposed
                 objective function, we use an alternating technique to
                 break down the objective function into several
                 manageable subproblems. We further introduce the
                 auxiliary function to solve the optimization problems
                 with rigorously proved convergence. The extensive
                 experiments have been conducted on both synthetic and
                 real-world data. All empirical results demonstrate the
                 effectiveness of our method.",
  acknowledgement = ack-nhfb,
  articleno =    "17",
  fjournal =     "ACM Transactions on Knowledge Discovery from Data
                 (TKDD)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1054",
}

@Article{Guzzo:2013:SIF,
  author =       "Antonella Guzzo and Luigi Moccia and Domenico
                 Sacc{\`a} and Edoardo Serra",
  title =        "Solving inverse frequent itemset mining with
                 infrequency constraints via large-scale linear
                 programs",
  journal =      j-TKDD,
  volume =       "7",
  number =       "4",
  pages =        "18:1--18:??",
  month =        nov,
  year =         "2013",
  CODEN =        "????",
  DOI =          "http://dx.doi.org/10.1145/2541268.2541271",
  ISSN =         "1556-4681 (print), 1556-472X (electronic)",
  ISSN-L =       "1556-4681",
  bibdate =      "Thu Mar 13 09:16:09 MDT 2014",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tkdd/;
                 http://www.math.utah.edu/pub/tex/bib/tkdd.bib",
  abstract =     "Inverse frequent set mining (IFM) is the problem of
                 computing a transaction database D satisfying given
                 support constraints for some itemsets, which are
                 typically the frequent ones. This article proposes a
                 new formulation of IFM, called IFM$_I$ (IFM with
                 infrequency constraints), where the itemsets that are
                 not listed as frequent are constrained to be
                 infrequent; that is, they must have a support less than
                 or equal to a specified unique threshold. An instance
                 of IFM$_I$ can be seen as an instance of the original
                 IFM by making explicit the infrequency constraints for
                 the minimal infrequent itemsets, corresponding to the
                 so-called negative generator border defined in the
                 literature. The complexity increase from PSPACE
                 (complexity of IFM) to NEXP (complexity of IFM$_I$) is
                 caused by the cardinality of the negative generator
                 border, which can be exponential in the original input
                 size. Therefore, the article introduces a specific
                 problem parameter $ \kappa $ that computes an upper
                 bound to this cardinality using a hypergraph
                 interpretation for which minimal infrequent itemsets
                 correspond to minimal transversals. By fixing a
                 constant k, the article formulates a $k$-bounded
                 definition of the problem, called $k$-IFM$_I$, that
                 collects all instances for which the value of the
                 parameter $ \kappa $ is less than or equal to $k$;
                 its complexity is in PSPACE, as for IFM. The bounded
                 problem
                 is encoded as an integer linear program with a large
                 number of variables (actually exponential w.r.t. the
                 number of constraints), which is thereafter
                 approximated by relaxing integer constraints; the
                 decision problem of solving the linear program is
                 proven to be in NP. In order to solve the linear
                 program, a column generation technique is used that is
                 a variation of the simplex method designed to solve
                 large-scale linear programs, in particular with a huge
                 number of variables. The method at each step requires
                 the solution of an auxiliary integer linear program,
                 which is proven to be NP-hard in this case and for
                 which a greedy heuristic is presented. The resulting
                 overall column generation solution algorithm enjoys
                 very good scaling as evidenced by the intensive
                 experimentation, thereby paving the way for its
                 application in real-life scenarios.",
  acknowledgement = ack-nhfb,
  articleno =    "18",
  fjournal =     "ACM Transactions on Knowledge Discovery from Data
                 (TKDD)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1054",
}
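
The negative generator border that drives the complexity discussion
is simple to state operationally: the minimal itemsets outside the
frequent collection F, that is, infrequent itemsets all of whose
proper subsets are frequent. The brute-force sketch below enumerates
it for a toy universe, assuming F is downward closed as it is for any
support threshold; real instances need the hypergraph-transversal
machinery discussed in the article.

# Enumerate minimal infrequent itemsets (the negative border) of a
# downward-closed frequent collection F.  Exponential: toy use only.
from itertools import combinations

items = "abcd"
F = {frozenset(s) for s in ["", "a", "b", "c", "d", "ab", "ac", "bc", "abc"]}

border = []
for k in range(1, len(items) + 1):
    for cand in combinations(items, k):
        cs = frozenset(cand)
        if cs not in F and all(cs - {x} in F for x in cs):
            border.append(set(cs))
print(border)   # [{'a','d'}, {'b','d'}, {'c','d'}] for this F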

@Article{Balcazar:2013:FCP,
  author =       "Jos{\'e} L. Balc{\'a}zar",
  title =        "Formal and computational properties of the confidence
                 boost of association rules",
  journal =      j-TKDD,
  volume =       "7",
  number =       "4",
  pages =        "19:1--19:??",
  month =        nov,
  year =         "2013",
  CODEN =        "????",
  DOI =          "http://dx.doi.org/10.1145/2541268.2541272",
  ISSN =         "1556-4681 (print), 1556-472X (electronic)",
  ISSN-L =       "1556-4681",
  bibdate =      "Thu Mar 13 09:16:09 MDT 2014",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tkdd/;
                 http://www.math.utah.edu/pub/tex/bib/tkdd.bib",
  abstract =     "Some existing notions of redundancy among association
                 rules allow for a logical-style characterization and
                 lead to irredundant bases of absolutely minimum size.
                 We push the intuition of redundancy further to find an
                 intuitive notion of novelty of an association rule,
                 with respect to other rules. Namely, an irredundant
                 rule is so because its confidence is higher than what
                 the rest of the rules would suggest; then, one can ask:
                 how much higher? We propose to measure such a sort of
                 novelty through the confidence boost of a rule. Acting
                 as a complement to confidence and support, the
                 confidence boost helps to obtain small and crisp sets
                 of mined association rules and solves the well-known
                 problem that, in certain cases, rules of negative
                 correlation may pass the confidence bound. We analyze
                 the properties of two versions of the notion of
                 confidence boost, one of them a natural generalization
                 of the other. We develop algorithms to filter rules
                 according to their confidence boost, compare the
                 concept to some similar notions in the literature, and
                 describe the results of some experimentation employing
                 the new notions on standard benchmark datasets. We
                 describe an open source association mining tool that
                 embodies one of our variants of confidence boost in
                 such a way that the data mining process does not
                 require the user to select any value for any
                 parameter.",
  acknowledgement = ack-nhfb,
  articleno =    "19",
  fjournal =     "ACM Transactions on Knowledge Discovery from Data
                 (TKDD)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1054",
}
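
Support and confidence are standard; the boost idea compares a rule's
confidence with the best confidence among the rules that would render
it redundant. The sketch below uses a deliberately simplified proxy,
the maximum confidence over same-consequent rules with strictly
smaller antecedents, and is not the article's exact definition of
confidence boost.

# Support, confidence, and a simplified boost-style ratio on a toy
# transaction database.
from itertools import combinations

D = [{"a", "b", "c"}, {"a", "b"}, {"a", "c"}, {"b", "c"}, {"a", "b", "c"}]

def support(X):
    return sum(1 for t in D if X <= t) / len(D)

def confidence(X, Y):
    return support(X | Y) / support(X)

def boost_proxy(X, Y):
    # Compare against every rule Z -> Y with Z a proper subset of X.
    alts = [confidence(frozenset(Z), Y)
            for k in range(len(X)) for Z in combinations(sorted(X), k)
            if support(frozenset(Z)) > 0]
    return confidence(X, Y) / max(alts)

# Rules scoring close to 1 add little over their simpler versions.
print(boost_proxy(frozenset("ab"), frozenset("c")))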

@Article{Ang:2013:CPN,
  author =       "Hock Hee Ang and Vivekanand Gopalkrishnan and Steven
                 C. H. Hoi and Wee Keong Ng",
  title =        "Classification in {P2P} networks with cascade support
                 vector machines",
  journal =      j-TKDD,
  volume =       "7",
  number =       "4",
  pages =        "20:1--20:??",
  month =        nov,
  year =         "2013",
  CODEN =        "????",
  DOI =          "http://dx.doi.org/10.1145/2541268.2541273",
  ISSN =         "1556-4681 (print), 1556-472X (electronic)",
  ISSN-L =       "1556-4681",
  bibdate =      "Thu Mar 13 09:16:09 MDT 2014",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tkdd/;
                 http://www.math.utah.edu/pub/tex/bib/tkdd.bib",
  abstract =     "Classification in Peer-to-Peer (P2P) networks is
                 important to many real applications, such as
                 distributed intrusion detection, distributed
                 recommendation systems, and distributed antispam
                 detection. However, it is very challenging to perform
                 classification in P2P networks due to many practical
                 issues, such as scalability, peer dynamism, and
                 asynchronism. This article investigates the practical
                 techniques of constructing Support Vector Machine (SVM)
                 classifiers in the P2P networks. In particular, we
                 demonstrate how to efficiently cascade SVM in a P2P
                 network with the use of reduced SVM. In addition, we
                 propose to fuse the concept of cascade SVM with
                 bootstrap aggregation to effectively balance the
                 trade-off between classification accuracy, model
                 construction, and prediction cost. We provide
                 theoretical insights for the proposed solutions and
                 conduct an extensive set of empirical studies on a
                 number of large-scale datasets. Encouraging results
                 validate the efficacy of the proposed approach.",
  acknowledgement = ack-nhfb,
  articleno =    "20",
  fjournal =     "ACM Transactions on Knowledge Discovery from Data
                 (TKDD)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1054",
}
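
The cascade principle is easy to state in its basic centralized form:
train an SVM per data partition, keep only the support vectors, and
retrain on their union. The sketch below shows exactly that with
scikit-learn; the article's contribution, cascading over a P2P
overlay with reduced SVMs and bootstrap aggregation, is built on top
of this idea and is not reproduced here.

# Basic (single-machine) cascade SVM over two partitions.
import numpy as np
from sklearn.svm import SVC

def cascade_svm(partitions, labels, C=1.0):
    sv_X, sv_y = [], []
    for X, y in zip(partitions, labels):
        clf = SVC(kernel="linear", C=C).fit(X, y)
        sv_X.append(X[clf.support_])    # keep support vectors only
        sv_y.append(y[clf.support_])
    return SVC(kernel="linear", C=C).fit(np.vstack(sv_X),
                                         np.concatenate(sv_y))

rng = np.random.default_rng(0)
X = np.vstack([rng.normal(-1, 1, (100, 2)), rng.normal(1, 1, (100, 2))])
y = np.repeat([0, 1], 100)
idx = rng.permutation(200)
model = cascade_svm([X[idx[:100]], X[idx[100:]]],
                    [y[idx[:100]], y[idx[100:]]])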

@Article{Chen:2014:ISI,
  author =       "Wei Chen and Jie Tang",
  title =        "Introduction to special issue on computational aspects
                 of social and information networks: Theory,
                 methodologies, and applications {(TKDD-CASIN)}",
  journal =      j-TKDD,
  volume =       "8",
  number =       "1",
  pages =        "1:1--1:??",
  month =        feb,
  year =         "2014",
  CODEN =        "????",
  DOI =          "http://dx.doi.org/10.1145/2556608",
  ISSN =         "1556-4681 (print), 1556-472X (electronic)",
  ISSN-L =       "1556-4681",
  bibdate =      "Thu Mar 13 09:16:11 MDT 2014",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tkdd/;
                 http://www.math.utah.edu/pub/tex/bib/tkdd.bib",
  acknowledgement = ack-nhfb,
  articleno =    "1",
  fjournal =     "ACM Transactions on Knowledge Discovery from Data
                 (TKDD)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1054",
}

@Article{Yang:2014:USN,
  author =       "Zhi Yang and Christo Wilson and Xiao Wang and Tingting
                 Gao and Ben Y. Zhao and Yafei Dai",
  title =        "Uncovering social network {Sybils} in the wild",
  journal =      j-TKDD,
  volume =       "8",
  number =       "1",
  pages =        "2:1--2:??",
  month =        feb,
  year =         "2014",
  CODEN =        "????",
  DOI =          "http://dx.doi.org/10.1145/2556609",
  ISSN =         "1556-4681 (print), 1556-472X (electronic)",
  ISSN-L =       "1556-4681",
  bibdate =      "Thu Mar 13 09:16:11 MDT 2014",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tkdd/;
                 http://www.math.utah.edu/pub/tex/bib/tkdd.bib",
  abstract =     "Sybil accounts are fake identities created to unfairly
                 increase the power or resources of a single malicious
                 user. Researchers have long known about the existence
                 of Sybil accounts in online communities such as
                 file-sharing systems, but they have not been able to
                 perform large-scale measurements to detect them or
                 measure their activities. In this article, we describe
                 our efforts to detect, characterize, and understand
                 Sybil account activity in the Renren Online Social
                 Network (OSN). We use ground truth provided by Renren
                 Inc. to build measurement-based Sybil detectors and
                 deploy them on Renren to detect more than 100,000 Sybil
                 accounts. Using our full dataset of 650,000 Sybils, we
                 examine several aspects of Sybil behavior. First, we
                 study their link creation behavior and find that
                 contrary to prior conjecture, Sybils in OSNs do not
                 form tight-knit communities. Next, we examine the
                 fine-grained behaviors of Sybils on Renren using
                 clickstream data. Third, we investigate
                 behind-the-scenes collusion between large groups of
                 Sybils. Our results reveal that Sybils with no explicit
                 social ties still act in concert to launch attacks.
                 Finally, we investigate enhanced techniques to identify
                 stealthy Sybils. In summary, our study advances the
                 understanding of Sybil behavior on OSNs and shows that
                 Sybils can effectively avoid existing community-based
                 Sybil detectors. We hope that our results will foster
                 new research on Sybil detection that is based on novel
                 types of Sybil features.",
  acknowledgement = ack-nhfb,
  articleno =    "2",
  fjournal =     "ACM Transactions on Knowledge Discovery from Data
                 (TKDD)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1054",
}

@Article{Jin:2014:SAR,
  author =       "Ruoming Jin and Victor E. Lee and Longjie Li",
  title =        "Scalable and axiomatic ranking of network role
                 similarity",
  journal =      j-TKDD,
  volume =       "8",
  number =       "1",
  pages =        "3:1--3:??",
  month =        feb,
  year =         "2014",
  CODEN =        "????",
  DOI =          "http://dx.doi.org/10.1145/2518176",
  ISSN =         "1556-4681 (print), 1556-472X (electronic)",
  ISSN-L =       "1556-4681",
  bibdate =      "Thu Mar 13 09:16:11 MDT 2014",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tkdd/;
                 http://www.math.utah.edu/pub/tex/bib/tkdd.bib",
  abstract =     "A key task in analyzing social networks and other
                 complex networks is role analysis: describing and
                 categorizing nodes according to how they interact with
                 other nodes. Two nodes have the same role if they
                 interact with equivalent sets of neighbors. The most
                 fundamental role equivalence is automorphic
                 equivalence. Unfortunately, the fastest algorithms
                 known for graph automorphism are nonpolynomial.
                 Moreover, since exact equivalence is rare, a more
                 meaningful task is measuring the role similarity
                 between any two nodes. This task is closely related to
                 the structural or link-based similarity problem that
                 SimRank addresses. However, SimRank and other existing
                 similarity measures are not sufficient because they do
                 not guarantee to recognize automorphically or
                 structurally equivalent nodes. This article makes two
                 contributions. First, we present and justify several
                 axiomatic properties necessary for a role similarity
                 measure or metric. Second, we present RoleSim, a new
                 similarity metric that satisfies these axioms and can
                 be computed with a simple iterative algorithm. We
                 rigorously prove that RoleSim satisfies all of these
                 axiomatic properties. We also introduce Iceberg
                 RoleSim, a scalable algorithm that discovers all pairs
                 with RoleSim scores above a user-defined threshold $
                 \theta $. We demonstrate the interpretative power of
                 RoleSim on both synthetic and real datasets.",
  acknowledgement = ack-nhfb,
  articleno =    "3",
  fjournal =     "ACM Transactions on Knowledge Discovery from Data
                 (TKDD)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1054",
}
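
The iterative algorithm mentioned in the abstract updates each pair's
score from a matching between the two nodes' neighborhoods,
normalized by the larger degree and damped by a decay factor beta.
The sketch below performs one such iteration with a greedy weighted
matching, a common approximation in this literature; details may
differ from the article's exact procedure.

# One RoleSim-style iteration on a toy undirected graph, starting
# from the all-ones similarity matrix.
import numpy as np

def rolesim_iteration(sim, nbrs, beta=0.15):
    n = len(nbrs)
    new = np.ones((n, n))
    for u in range(n):
        for v in range(u + 1, n):
            pairs = sorted(((sim[x, y], x, y)
                            for x in nbrs[u] for y in nbrs[v]),
                           reverse=True)
            used_x, used_y, total = set(), set(), 0.0
            for s, x, y in pairs:           # greedy weighted matching
                if x not in used_x and y not in used_y:
                    used_x.add(x)
                    used_y.add(y)
                    total += s
            denom = max(len(nbrs[u]), len(nbrs[v]), 1)
            new[u, v] = new[v, u] = (1.0 - beta) * total / denom + beta
    return new

nbrs = [[1, 2], [0, 2], [0, 1], [0]]   # adjacency lists
sim = np.ones((4, 4))
print(rolesim_iteration(sim, nbrs))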

@Article{Mcauley:2014:DSC,
  author =       "Julian Mcauley and Jure Leskovec",
  title =        "Discovering social circles in ego networks",
  journal =      j-TKDD,
  volume =       "8",
  number =       "1",
  pages =        "4:1--4:??",
  month =        feb,
  year =         "2014",
  CODEN =        "????",
  DOI =          "http://dx.doi.org/10.1145/2556612",
  ISSN =         "1556-4681 (print), 1556-472X (electronic)",
  ISSN-L =       "1556-4681",
  bibdate =      "Thu Mar 13 09:16:11 MDT 2014",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tkdd/;
                 http://www.math.utah.edu/pub/tex/bib/tkdd.bib",
  abstract =     "People's personal social networks are big and
                 cluttered, and currently there is no good way to
                 automatically organize them. Social networking sites
                 allow users to manually categorize their friends into
                 social circles (e.g., ``circles'' on Google+, and
                 ``lists'' on Facebook and Twitter). However, circles
                 are laborious to construct and must be manually updated
                 whenever a user's network grows. In this article, we
                 study the novel task of automatically identifying
                 users' social circles. We pose this task as a
                 multimembership node clustering problem on a user's ego
                 network, a network of connections between her friends.
                 We develop a model for detecting circles that combines
                 network structure as well as user profile information.
                 For each circle, we learn its members and the
                 circle-specific user profile similarity metric.
                 Modeling node membership to multiple circles allows us
                 to detect overlapping as well as hierarchically nested
                 circles. Experiments show that our model accurately
                 identifies circles on a diverse set of data from
                 Facebook, Google+, and Twitter, for all of which we
                 obtain hand-labeled ground truth.",
  acknowledgement = ack-nhfb,
  articleno =    "4",
  fjournal =     "ACM Transactions on Knowledge Discovery from Data
                 (TKDD)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1054",
}

@Article{Abrahao:2014:SFA,
  author =       "Bruno Abrahao and Sucheta Soundarajan and John
                 Hopcroft and Robert Kleinberg",
  title =        "A separability framework for analyzing community
                 structure",
  journal =      j-TKDD,
  volume =       "8",
  number =       "1",
  pages =        "5:1--5:??",
  month =        feb,
  year =         "2014",
  CODEN =        "????",
  DOI =          "http://dx.doi.org/10.1145/2527231",
  ISSN =         "1556-4681 (print), 1556-472X (electronic)",
  ISSN-L =       "1556-4681",
  bibdate =      "Thu Mar 13 09:16:11 MDT 2014",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tkdd/;
                 http://www.math.utah.edu/pub/tex/bib/tkdd.bib",
  abstract =     "Four major factors govern the intricacies of community
                 extraction in networks: (1) the literature offers a
                 multitude of disparate community detection algorithms
                 whose output exhibits high structural variability
                 across the collection, (2) communities identified by
                 algorithms may differ structurally from real
                 communities that arise in practice, (3) there is no
                 consensus characterizing how to discriminate
                 communities from noncommunities, and (4) the
                 application domain includes a wide variety of networks
                 of fundamentally different natures. In this article, we
                 present a class separability framework to tackle these
                 challenges through a comprehensive analysis of
                 community properties. Our approach enables the
                 assessment of the structural dissimilarity among the
                 output of multiple community detection algorithms and
                 between the output of algorithms and communities that
                 arise in practice. In addition, our method provides us
                 with a way to organize the vast collection of community
                 detection algorithms by grouping those that behave
                 similarly. Finally, we identify the most discriminative
                 graph-theoretical properties of community signature and
                 the small subset of properties that account for most of
                 the biases of the different community detection
                 algorithms. We illustrate our approach with an
                 experimental analysis, which reveals nuances of the
                 structure of real and extracted communities. In our
                 experiments, we furnish our framework with the output
                 of 10 different community detection procedures,
                 representative of categories of popular algorithms
                 available in the literature, applied to a diverse
                 collection of large-scale real network datasets whose
                 domains span biology, online shopping, and social
                 systems. We also analyze communities identified by
                 annotations that accompany the data, which reflect
                 exemplar communities in various domains. We characterize
                 these communities using a broad spectrum of community
                 properties to produce the different structural classes.
                 As our experiments show that community structure is not
                 a universal concept, our framework enables an informed
                 choice of the most suitable community detection method
                 for identifying communities of a specific type in a
                 given network and allows for a comparison of existing
                 community detection algorithms while guiding the design
                 of new ones.",
  acknowledgement = ack-nhfb,
  articleno =    "5",
  fjournal =     "ACM Transactions on Knowledge Discovery from Data
                 (TKDD)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1054",
}

@Article{Zhong:2014:UBL,
  author =       "Erheng Zhong and Wei Fan and Qiang Yang",
  title =        "User behavior learning and transfer in composite
                 social networks",
  journal =      j-TKDD,
  volume =       "8",
  number =       "1",
  pages =        "6:1--6:??",
  month =        feb,
  year =         "2014",
  CODEN =        "????",
  DOI =          "http://dx.doi.org/10.1145/2556613",
  ISSN =         "1556-4681 (print), 1556-472X (electronic)",
  ISSN-L =       "1556-4681",
  bibdate =      "Thu Mar 13 09:16:11 MDT 2014",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tkdd/;
                 http://www.math.utah.edu/pub/tex/bib/tkdd.bib",
  abstract =     "Accurate prediction of user behaviors is important for
                 many social media applications, including social
                 marketing, personalization, and recommendation. A major
                 challenge lies in that although many previous works
                 model user behavior from only historical behavior logs,
                 the available user behavior data or interactions
                 between users and items in a given social network are
                 usually very limited and sparse (e.g., $ \geq 99.9 \% $
                 empty), which makes models overfit the rare
                 observations and fail to provide accurate predictions.
                 We observe that many people are members of several
                 social networks at the same time, such as Facebook,
                 Twitter, and Tencent's QQ. Importantly, users'
                 behaviors and interests in different networks influence
                 one another. This provides an opportunity to leverage
                 the knowledge of user behaviors in different networks
                 by considering the overlapping users in different
                 networks as bridges, in order to alleviate the data
                 sparsity problem, and enhance the predictive
                 performance of user behavior modeling. Combining
                 different networks ``simply and naively'' does not work
                 well. In this article, we formulate the problem to
                 model multiple networks as ``adaptive composite
                 transfer'' and propose a framework called ComSoc.
                 ComSoc first selects the most suitable networks inside
                 a composite social network via a hierarchical Bayesian
                 model, parameterized for individual users. It then
                 builds topic models for user behavior prediction using
                 both the relationships in the selected networks and
                 related behavior data. With different relational
                 regularization, we introduce different implementations,
                 corresponding to different ways to transfer knowledge
                 from composite social relations. To handle big data, we
                 have implemented the algorithm using Map/Reduce. We
                 demonstrate that the proposed composite network-based
                 user behavior models significantly improve the
                 predictive accuracy over a number of existing
                 approaches on several real-world applications,
                 including a very large social networking dataset from
                 Tencent Inc.",
  acknowledgement = ack-nhfb,
  articleno =    "6",
  fjournal =     "ACM Transactions on Knowledge Discovery from Data
                 (TKDD)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1054",
}

@Article{Ahmed:2014:NSS,
  author =       "Nesreen K. Ahmed and Jennifer Neville and Ramana
                 Kompella",
  title =        "Network Sampling: From Static to Streaming Graphs",
  journal =      j-TKDD,
  volume =       "8",
  number =       "2",
  pages =        "7:1--7:??",
  month =        jun,
  year =         "2014",
  CODEN =        "????",
  DOI =          "http://dx.doi.org/10.1145/2601438",
  ISSN =         "1556-4681 (print), 1556-472X (electronic)",
  ISSN-L =       "1556-4681",
  bibdate =      "Thu Jun 26 05:48:22 MDT 2014",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tkdd/;
                 http://www.math.utah.edu/pub/tex/bib/tkdd.bib",
  abstract =     "Network sampling is integral to the analysis of
                 social, information, and biological networks. Since
                 many real-world networks are massive in size,
                 continuously evolving, and/or distributed in nature,
                 the network structure is often sampled in order to
                 facilitate study. For these reasons, a more thorough
                 and complete understanding of network sampling is
                 critical to support the field of network science. In
                 this paper, we outline a framework for the general
                 problem of network sampling by highlighting the
                 different objectives, population and units of interest,
                 and classes of network sampling methods. In addition,
                 we propose a spectrum of computational models for
                 network sampling methods, ranging from the
                 traditionally studied model based on the assumption of
                 a static domain to a more challenging model that is
                 appropriate for streaming domains. We design a family
                 of sampling methods based on the concept of graph
                 induction that generalize across the full spectrum of
                 computational models (from static to streaming) while
                 efficiently preserving many of the topological
                 properties of the input graphs. Furthermore, we
                 demonstrate how traditional static sampling algorithms
                 can be modified for graph streams for each of the three
                 main classes of sampling methods: node, edge, and
                 topology-based sampling. Experimental results indicate
                 that our proposed family of sampling methods more
                 accurately preserves the underlying properties of the
                 graph in both static and streaming domains. Finally, we
                 study the impact of network sampling algorithms on the
                 parameter estimation and performance evaluation of
                 relational classification algorithms.",
  acknowledgement = ack-nhfb,
  articleno =    "7",
  fjournal =     "ACM Transactions on Knowledge Discovery from Data
                 (TKDD)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1054",
}
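
%%% The abstract's streaming design combines edge sampling with
%%% graph induction. A minimal Python sketch of that idea, under
%%% our own simplifications (names are illustrative; this is not
%%% the authors' exact family of algorithms):
%%%
%%%     def stream_sample_with_induction(edge_stream, max_nodes):
%%%         # Grow the node sample from incoming edges until the
%%%         # budget is reached; afterward, keep only edges whose
%%%         # endpoints are both already sampled (graph induction).
%%%         nodes, edges = set(), set()
%%%         for u, v in edge_stream:
%%%             if len(nodes) < max_nodes:
%%%                 nodes.update((u, v))
%%%                 edges.add((u, v))
%%%             elif u in nodes and v in nodes:
%%%                 edges.add((u, v))
%%%         return nodes, edges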

@Article{Ge:2014:RMA,
  author =       "Yong Ge and Guofei Jiang and Min Ding and Hui Xiong",
  title =        "Ranking Metric Anomaly in Invariant Networks",
  journal =      j-TKDD,
  volume =       "8",
  number =       "2",
  pages =        "8:1--8:??",
  month =        jun,
  year =         "2014",
  CODEN =        "????",
  DOI =          "http://dx.doi.org/10.1145/2601436",
  ISSN =         "1556-4681 (print), 1556-472X (electronic)",
  ISSN-L =       "1556-4681",
  bibdate =      "Thu Jun 26 05:48:22 MDT 2014",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tkdd/;
                 http://www.math.utah.edu/pub/tex/bib/tkdd.bib",
  abstract =     "The management of large-scale distributed information
                 systems relies on the effective use and modeling of
                 monitoring data collected at various points in the
                 distributed information systems. A traditional approach
                 to model monitoring data is to discover invariant
                 relationships among the monitoring data. Indeed, we can
                 discover all invariant relationships among all pairs of
                 monitoring data and generate invariant networks, where
                 a node is a monitoring data source (metric) and a link
                 indicates an invariant relationship between two
                 monitoring data sources. Such an invariant network
                 representation can help system experts to localize and
                 diagnose the system faults by examining those broken
                 invariant relationships and their related metrics,
                 since system faults usually propagate among the
                 monitoring data and eventually lead to some broken
                 invariant relationships. However, at any one time,
                 there are usually many broken links (invariant
                 relationships) within an invariant network. Without
                 proper guidance, it is difficult for system experts to
                 manually inspect this large number of broken links. To
                 this end, in this article, we propose the problem of
                 ranking metrics according to their anomaly levels for
                 a given invariant network, which is a nontrivial task
                 due to the uncertainties and the complex nature of
                 invariant networks. Specifically, we propose two types
                 of algorithms for ranking metric anomaly by link
                 analysis in invariant networks. Along this line, we
                 first define two measurements to quantify the anomaly
                 level of each metric, and introduce the mRank
                 algorithm. Also, we provide a weighted score mechanism
                 and develop the gRank algorithm, which involves an
                 iterative process to obtain a score to measure the
                 anomaly levels. In addition, some extended algorithms
                 based on the mRank and gRank algorithms are developed
                 by taking into account the probability of being broken
                 as well as noisy links. Finally, we validate all the
                 proposed algorithms on a large number of real-world and
                 synthetic data sets to illustrate the effectiveness and
                 efficiency of different algorithms.",
  acknowledgement = ack-nhfb,
  articleno =    "8",
  fjournal =     "ACM Transactions on Knowledge Discovery from Data
                 (TKDD)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1054",
}

@Article{Zhang:2014:DGP,
  author =       "Gensheng Zhang and Xiao Jiang and Ping Luo and Min
                 Wang and Chengkai Li",
  title =        "Discovering General Prominent Streaks in Sequence
                 Data",
  journal =      j-TKDD,
  volume =       "8",
  number =       "2",
  pages =        "9:1--9:??",
  month =        jun,
  year =         "2014",
  CODEN =        "????",
  DOI =          "http://dx.doi.org/10.1145/2601439",
  ISSN =         "1556-4681 (print), 1556-472X (electronic)",
  ISSN-L =       "1556-4681",
  bibdate =      "Thu Jun 26 05:48:22 MDT 2014",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tkdd/;
                 http://www.math.utah.edu/pub/tex/bib/tkdd.bib",
  abstract =     "This article studies the problem of prominent streak
                 discovery in sequence data. Given a sequence of values,
                 a prominent streak is a long consecutive subsequence
                 consisting of only large (small) values, such as
                 consecutive games of outstanding performance in sports,
                 consecutive hours of heavy network traffic, and
                 consecutive days of frequent mentioning of a person in
                 social media. Prominent streak discovery provides
                 insightful data patterns for data analysis in many
                 real-world applications and is an enabling technique
                 for computational journalism. Given its real-world
                 usefulness and complexity, the research on prominent
                 streaks in sequence data opens a spectrum of
                 challenging problems. A baseline approach to finding
                 prominent streaks is a quadratic algorithm that
                 exhaustively enumerates all possible streaks and
                 performs pairwise streak dominance comparison. For more
                 efficient methods, we make the observation that
                 prominent streaks are in fact skyline points in two
                 dimensions: streak interval length and minimum value in
                 the interval. Our solution thus hinges on the idea to
                 separate the two steps in prominent streak discovery:
                 candidate streak generation and skyline operation over
                 candidate streaks. For candidate generation, we propose
                 the concept of local prominent streak (LPS). We prove
                 that prominent streaks are a subset of LPSs and the
                 number of LPSs is less than the length of a data
                 sequence, in comparison with the quadratic number of
                 candidates produced by the brute-force baseline method.
                 We develop efficient algorithms based on the concept of
                 LPS. The nonlinear local prominent streak (NLPS)-based
                 method considers a superset of LPSs as candidates, and
                 the linear local prominent streak (LLPS)-based method
                 further guarantees to consider only LPSs. The proposed
                 properties and algorithms are also extended for
                 discovering general top-k, multisequence, and
                 multidimensional prominent streaks. The results of
                 experiments using multiple real datasets verified the
                 effectiveness of the proposed methods and showed orders
                 of magnitude performance improvement against the
                 baseline method.",
  acknowledgement = ack-nhfb,
  articleno =    "9",
  fjournal =     "ACM Transactions on Knowledge Discovery from Data
                 (TKDD)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1054",
}
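
%%% The quadratic baseline described in the abstract is short to
%%% state in Python: enumerate every streak with its (interval
%%% length, minimum value) pair, then keep the skyline. A sketch
%%% (function name ours; the LPS-based algorithms in the article
%%% avoid the quadratic candidate set):
%%%
%%%     def prominent_streaks(seq):
%%%         cands = set()
%%%         for i in range(len(seq)):
%%%             lo = seq[i]
%%%             for j in range(i, len(seq)):
%%%                 lo = min(lo, seq[j])        # running minimum
%%%                 cands.add((j - i + 1, lo))  # (length, min value)
%%%         # Skyline: keep streaks not dominated in both length
%%%         # and minimum value by another streak.
%%%         return [c for c in cands
%%%                 if not any(o[0] >= c[0] and o[1] >= c[1] and o != c
%%%                            for o in cands)]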

@Article{Schifanella:2014:MTD,
  author =       "Claudio Schifanella and K. Sel{\c{c}}uk Candan and
                 Maria Luisa Sapino",
  title =        "Multiresolution Tensor Decompositions with Mode
                 Hierarchies",
  journal =      j-TKDD,
  volume =       "8",
  number =       "2",
  pages =        "10:1--10:??",
  month =        jun,
  year =         "2014",
  CODEN =        "????",
  DOI =          "http://dx.doi.org/10.1145/2532169",
  ISSN =         "1556-4681 (print), 1556-472X (electronic)",
  ISSN-L =       "1556-4681",
  bibdate =      "Thu Jun 26 05:48:22 MDT 2014",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tkdd/;
                 http://www.math.utah.edu/pub/tex/bib/tkdd.bib",
  abstract =     "Tensors (multidimensional arrays) are widely used for
                 representing high-order data in applications ranging
                 from social networks and sensor data to Internet
                 traffic. Multiway data analysis
                 techniques, in particular tensor decompositions, allow
                 extraction of hidden correlations among multiway data
                 and thus are key components of many data analysis
                 frameworks. Intuitively, these algorithms can be
                 thought of as multiway clustering schemes, which
                 consider multiple facets of the data in identifying
                 clusters, their weights, and contributions of each data
                 element. Unfortunately, algorithms for fitting multiway
                 models are, in general, iterative and very time
                 consuming. In this article, we observe that, in many
                 applications, there is a priori background knowledge
                 (or metadata) about one or more domain dimensions. This
                 metadata is often in the form of a hierarchy that
                 clusters the elements of a given data facet (or mode).
                 We investigate whether such single-mode data
                 hierarchies can be used to boost the efficiency of
                 the tensor decomposition process, without significant
                 impact on the final decomposition quality. We consider
                 each domain hierarchy as a guide to help provide
                 higher- or lower-resolution views of the data in the
                 tensor on demand and we rely on these metadata-induced
                 multiresolution tensor representations to develop a
                 multiresolution approach to tensor decomposition. In
                 this article, we focus on an alternating least squares
                 (ALS)--based implementation of the two most important
                 decomposition models: the PARAllel FACtors model
                 (PARAFAC, which decomposes a tensor into a diagonal
                 tensor and a set of factor matrices) and the Tucker
                 model (which produces as a result a core tensor and a
                 set of dimension-subspace matrices). Experimental
                 results show
                 that, when the available metadata is used as a rough
                 guide, the proposed multiresolution method helps fit
                 both PARAFAC and Tucker models with consistent (under
                 different parameter settings) savings in execution
                 time and memory consumption, while preserving the
                 quality of the decomposition.",
  acknowledgement = ack-nhfb,
  articleno =    "10",
  fjournal =     "ACM Transactions on Knowledge Discovery from Data
                 (TKDD)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1054",
}

@Article{Huang:2014:RMN,
  author =       "Jin Huang and Feiping Nie and Heng Huang and Chris
                 Ding",
  title =        "Robust Manifold Nonnegative Matrix Factorization",
  journal =      j-TKDD,
  volume =       "8",
  number =       "3",
  pages =        "11:1--11:??",
  month =        jun,
  year =         "2014",
  CODEN =        "????",
  DOI =          "http://dx.doi.org/10.1145/2601434",
  ISSN =         "1556-4681 (print), 1556-472X (electronic)",
  ISSN-L =       "1556-4681",
  bibdate =      "Tue Jun 3 13:50:26 MDT 2014",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tkdd/;
                 http://www.math.utah.edu/pub/tex/bib/tkdd.bib",
  abstract =     "Nonnegative Matrix Factorization (NMF) has been one of
                 the most widely used clustering techniques for
                 exploratory data analysis. However, since each data
                 point enters the objective function with squared
                 residue error, a few outliers with large errors easily
                 dominate the objective function. In this article, we
                 propose a Robust Manifold Nonnegative Matrix
                 Factorization (RMNMF) method using the $l_{2,1}$-norm and
                 integrating NMF and spectral clustering under the same
                 clustering framework. We also point out the solution
                 uniqueness issue for the existing NMF methods and
                 propose an additional orthonormal constraint to address
                 this problem. With the new constraint, the conventional
                 auxiliary function approach no longer works. We tackle
                 this difficult optimization problem via a novel
                 Augmented Lagrangian Method (ALM)--based algorithm and
                 convert the original constrained optimization problem
                 on one variable into a multivariate constrained
                 problem. The new objective function then can be
                 decomposed into several subproblems that each has a
                 closed-form solution. More importantly, we reveal the
                 connection of our method with robust K-means and
                 spectral clustering, and we demonstrate its theoretical
                 significance. Extensive experiments have been conducted
                 on nine benchmark datasets, and all empirical results
                 show the effectiveness of our method.",
  acknowledgement = ack-nhfb,
  articleno =    "11",
  fjournal =     "ACM Transactions on Knowledge Discovery from Data
                 (TKDD)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1054",
}
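
%%% The $l_{2,1}$ norm mentioned in the abstract sums the unsquared
%%% Euclidean norms of per-sample residuals, so one outlier enters
%%% linearly rather than quadratically. A small NumPy sketch,
%%% assuming samples are columns (the paper's orientation may
%%% differ):
%%%
%%%     import numpy as np
%%%
%%%     def l21_norm(E):
%%%         # Sum of the Euclidean norms of the columns of E.
%%%         return np.linalg.norm(E, axis=0).sum()
%%%
%%%     X = np.abs(np.random.randn(5, 8))   # nonnegative data
%%%     F = np.abs(np.random.randn(5, 2))   # basis factor
%%%     G = np.abs(np.random.randn(2, 8))   # coefficient factor
%%%     print(l21_norm(X - F @ G))          # robust residual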

@Article{Zhang:2014:RAL,
  author =       "Yu Zhang and Dit-Yan Yeung",
  title =        "A Regularization Approach to Learning Task
                 Relationships in Multitask Learning",
  journal =      j-TKDD,
  volume =       "8",
  number =       "3",
  pages =        "12:1--12:??",
  month =        jun,
  year =         "2014",
  CODEN =        "????",
  DOI =          "http://dx.doi.org/10.1145/2538028",
  ISSN =         "1556-4681 (print), 1556-472X (electronic)",
  ISSN-L =       "1556-4681",
  bibdate =      "Tue Jun 3 13:50:26 MDT 2014",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tkdd/;
                 http://www.math.utah.edu/pub/tex/bib/tkdd.bib",
  abstract =     "Multitask learning is a learning paradigm that seeks
                 to improve the generalization performance of a learning
                 task with the help of some other related tasks. In this
                 article, we propose a regularization approach to
                 learning the relationships between tasks in multitask
                 learning. This approach can be viewed as a novel
                 generalization of the regularized formulation for
                 single-task learning. Besides modeling positive task
                 correlation, our approach, multitask relationship
                 learning (MTRL), can also describe negative task
                 correlation and identify outlier tasks based on the
                 same underlying principle. By utilizing a
                 matrix-variate normal distribution as a prior on the
                 model parameters of all tasks, our MTRL method has a
                 jointly convex objective function. For efficiency, we
                 use an alternating method to learn the optimal model
                 parameters for each task as well as the relationships
                 between tasks. We study MTRL in the symmetric multitask
                 learning setting and then generalize it to the
                 asymmetric setting as well. We also discuss some
                 variants of the regularization approach to demonstrate
                 the use of other matrix-variate priors for learning
                 task relationships. Moreover, to gain more insight into
                 our model, we also study the relationships between MTRL
                 and some existing multitask learning methods.
                 Experiments conducted on a toy problem as well as
                 several benchmark datasets demonstrate the
                 effectiveness of MTRL as well as its high
                 interpretability revealed by the task covariance
                 matrix.",
  acknowledgement = ack-nhfb,
  articleno =    "12",
  fjournal =     "ACM Transactions on Knowledge Discovery from Data
                 (TKDD)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1054",
}
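
%%% A hedged reconstruction, in our own notation, of the kind of
%%% jointly convex objective the abstract describes: a task
%%% covariance matrix $\Omega$ is learned together with the task
%%% weight matrix $W = (\mathbf{w}_1, \ldots, \mathbf{w}_m)$
%%% (details may differ from the article):
%%%
%%%   \min_{W, b, \Omega}\;
%%%     \sum_{t=1}^{m} \frac{1}{n_t} \sum_{i=1}^{n_t}
%%%       \ell\bigl(y_i^t,\; \mathbf{w}_t^\top \mathbf{x}_i^t + b_t\bigr)
%%%     + \frac{\lambda_1}{2}\, \mathrm{tr}(W W^\top)
%%%     + \frac{\lambda_2}{2}\, \mathrm{tr}(W \Omega^{-1} W^\top)
%%%   \quad \text{s.t. } \Omega \succeq 0,\; \mathrm{tr}(\Omega) = 1.
%%%
%%% Positive, negative, and near-zero entries of the learned
%%% $\Omega$ then model positive, negative, and absent task
%%% correlation, respectively.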

@Article{Lin:2014:SCR,
  author =       "Ming Lin and Shifeng Weng and Changshui Zhang",
  title =        "On the Sample Complexity of Random {Fourier} Features
                 for Online Learning: How Many Random {Fourier} Features
                 Do We Need?",
  journal =      j-TKDD,
  volume =       "8",
  number =       "3",
  pages =        "13:1--13:??",
  month =        jun,
  year =         "2014",
  CODEN =        "????",
  DOI =          "http://dx.doi.org/10.1145/2611378",
  ISSN =         "1556-4681 (print), 1556-472X (electronic)",
  ISSN-L =       "1556-4681",
  bibdate =      "Tue Jun 3 13:50:26 MDT 2014",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tkdd/;
                 http://www.math.utah.edu/pub/tex/bib/tkdd.bib",
  abstract =     "We study the sample complexity of random Fourier
                 features for online kernel learning, that is, the
                 number of random Fourier features required to achieve
                 good generalization performance. We show that when the
                 loss function is strongly convex and smooth, online
                 kernel learning with random Fourier features can
                 achieve an $O(\log T / T)$ bound for the excess risk
                 with only $O(1 / \lambda^2)$ random Fourier features,
                 where $T$ is the number of training examples and
                 $\lambda$ is the modulus of strong convexity. This is
                 a significant improvement over the existing result for
                 batch kernel learning, which requires $O(T)$ random
                 Fourier features to achieve a generalization bound of
                 $O(1 / \sqrt{T})$. Our empirical study verifies that
                 online kernel learning with a limited number of random
                 Fourier features can achieve generalization
                 performance similar to online learning using the full
                 kernel matrix.
                 present an enhanced online learning algorithm with
                 random Fourier features that improves the
                 classification performance by multiple passes of
                 training examples and a partial average.",
  acknowledgement = ack-nhfb,
  articleno =    "13",
  fjournal =     "ACM Transactions on Knowledge Discovery from Data
                 (TKDD)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1054",
}
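
%%% The abstract is about how many random Fourier features are
%%% needed; the feature map itself is the standard one for the
%%% Gaussian kernel $k(x, y) = \exp(-\gamma \|x - y\|^2)$. A
%%% minimal NumPy sketch (illustrative only; not the article's
%%% enhanced online algorithm):
%%%
%%%     import numpy as np
%%%
%%%     def random_fourier_features(X, n_features, gamma, seed=0):
%%%         # z(x) = sqrt(2/D) * cos(x W + b), W ~ N(0, 2*gamma),
%%%         # b ~ U[0, 2*pi], so that z(x).z(y) approximates k(x, y).
%%%         rng = np.random.default_rng(seed)
%%%         W = rng.normal(0.0, np.sqrt(2.0 * gamma),
%%%                        size=(X.shape[1], n_features))
%%%         b = rng.uniform(0.0, 2.0 * np.pi, size=n_features)
%%%         return np.sqrt(2.0 / n_features) * np.cos(X @ W + b)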

@Article{Eyal:2014:PIM,
  author =       "Ron Eyal and Avi Rosenfeld and Sigal Sina and Sarit
                 Kraus",
  title =        "Predicting and Identifying Missing Node Information in
                 Social Networks",
  journal =      j-TKDD,
  volume =       "8",
  number =       "3",
  pages =        "14:1--14:??",
  month =        jun,
  year =         "2014",
  CODEN =        "????",
  DOI =          "http://dx.doi.org/10.1145/2536775",
  ISSN =         "1556-4681 (print), 1556-472X (electronic)",
  ISSN-L =       "1556-4681",
  bibdate =      "Thu Jun 26 05:48:23 MDT 2014",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tkdd/;
                 http://www.math.utah.edu/pub/tex/bib/tkdd.bib",
  abstract =     "In recent years, social networks have surged in
                 popularity. One key aspect of social network research
                 is identifying important missing information that is
                 not explicitly represented in the network, or is not
                 visible to all. To date, this line of research has
                 typically focused on finding the connections that are
                 missing between nodes, a challenge commonly termed the
                 link prediction problem. This article introduces
                 the missing node identification problem, where missing
                 members in the social network structure must be
                 identified. In this problem, indications of missing
                 nodes are assumed to exist. Given these indications and
                 a partial network, we must assess which indications
                 originate from the same missing node and determine the
                 full network structure. Toward solving this problem, we
                 present the missing node identification by spectral
                 clustering algorithm (MISC), an approach based on a
                 spectral clustering algorithm, combined with nodes'
                 pairwise affinity measures that were adopted from link
                 prediction research. We evaluate the performance of our
                 approach in different problem settings and scenarios,
                 using real-life data from Facebook. The results show
                 that our approach yields promising results and can be
                 effective in solving the missing node identification
                 problem. In addition, this article also presents
                 R-MISC, which uses a sparse matrix representation,
                 efficient algorithms for calculating the nodes'
                 pairwise affinity, and a proprietary dimension
                 reduction technique to enable scaling the MISC
                 algorithm to large networks of more than 100,000 nodes.
                 Last, we consider problem settings where some of the
                 indications are unknown. Two algorithms are suggested
                 for this problem: speculative MISC, based on MISC, and
                 missing link completion, based on classical link
                 prediction literature. We show that speculative MISC
                 outperforms missing link completion.",
  acknowledgement = ack-nhfb,
  articleno =    "14",
  fjournal =     "ACM Transactions on Knowledge Discovery from Data
                 (TKDD)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1054",
}
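
%%% The spectral-clustering core of MISC, reduced to its simplest
%%% form: bipartition by the sign of the Fiedler vector of the
%%% graph Laplacian of a pairwise affinity matrix. A bare-bones
%%% sketch assuming a connected affinity graph (MISC itself uses
%%% link-prediction affinities and k clusters):
%%%
%%%     import numpy as np
%%%
%%%     def two_way_spectral(W):
%%%         # Unnormalized Laplacian L = D - W; the eigenvector of
%%%         # the second-smallest eigenvalue splits the nodes.
%%%         L = np.diag(W.sum(axis=1)) - W
%%%         vals, vecs = np.linalg.eigh(L)
%%%         return vecs[:, 1] >= 0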

@Article{Webb:2014:EDM,
  author =       "Geoffrey I. Webb and Jilles Vreeken",
  title =        "Efficient Discovery of the Most Interesting
                 Associations",
  journal =      j-TKDD,
  volume =       "8",
  number =       "3",
  pages =        "15:1--15:??",
  month =        jun,
  year =         "2014",
  CODEN =        "????",
  DOI =          "http://dx.doi.org/10.1145/2601433",
  ISSN =         "1556-4681 (print), 1556-472X (electronic)",
  ISSN-L =       "1556-4681",
  bibdate =      "Thu Jun 26 05:48:23 MDT 2014",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tkdd/;
                 http://www.math.utah.edu/pub/tex/bib/tkdd.bib",
  abstract =     "Self-sufficient itemsets have been proposed as an
                 effective approach to summarizing the key associations
                 in data. However, their computation appears highly
                 demanding, as assessing whether an itemset is
                 self-sufficient requires consideration of all pairwise
                 partitions of the itemset into pairs of subsets as well
                 as consideration of all supersets. This article
                 presents the first published algorithm for efficiently
                 discovering self-sufficient itemsets. This
                 branch-and-bound algorithm deploys two powerful pruning
                 mechanisms based on upper bounds on itemset value and
                 statistical significance level. It demonstrates that
                 finding top-k productive and nonredundant itemsets,
                 with postprocessing to identify those that are not
                 independently productive, can efficiently identify
                 small sets of key associations. We present extensive
                 evaluation of the strengths and limitations of the
                 technique, including comparisons with alternative
                 approaches to finding the most interesting
                 associations.",
  acknowledgement = ack-nhfb,
  articleno =    "15",
  fjournal =     "ACM Transactions on Knowledge Discovery from Data
                 (TKDD)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1054",
}

@Article{Shabtai:2014:ODM,
  author =       "Asaf Shabtai and Maya Bercovitch and Lior Rokach and
                 Yuval Elovici",
  title =        "Optimizing Data Misuse Detection",
  journal =      j-TKDD,
  volume =       "8",
  number =       "3",
  pages =        "16:1--16:??",
  month =        jun,
  year =         "2014",
  CODEN =        "????",
  DOI =          "http://dx.doi.org/10.1145/2611520",
  ISSN =         "1556-4681 (print), 1556-472X (electronic)",
  ISSN-L =       "1556-4681",
  bibdate =      "Tue Jun 3 13:50:26 MDT 2014",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tkdd/;
                 http://www.math.utah.edu/pub/tex/bib/tkdd.bib",
  abstract =     "Data misuse may be performed by entities such as an
                 organization's employees and business partners who are
                 granted access to sensitive information and misuse
                 their privileges. We assume that users can be either
                 trusted or untrusted. The access of untrusted parties
                 to data objects (e.g., client and patient records)
                 should be monitored in an attempt to detect misuse.
                 However, monitoring data objects is resource intensive
                 and time-consuming and may also cause disturbance or
                 inconvenience to the involved employees. Therefore, the
                 monitored data objects should be carefully selected. In
                 this article, we present two optimization problems
                 carefully designed for selecting specific data objects
                 for monitoring, such that the detection rate is
                 maximized and the monitoring effort is minimized. In
                 the first optimization problem, the goal is to select
                 data objects for monitoring that are accessed by at
                 most c trusted agents while ensuring access to at least
                 k monitored objects by each untrusted agent (both c and
                 k are integer variables). As opposed to the first
                 optimization problem, the goal of the second
                 optimization problem is to select monitored data
                 objects that maximize the number of monitored data
                 objects accessed by untrusted agents while ensuring
                 that each trusted agent does not access more than d
                 monitored data objects (d is an integer variable as
                 well). Two efficient heuristic algorithms for solving
                 these optimization problems are proposed, and
                 experiments were conducted simulating different
                 scenarios to evaluate the algorithms' performance.
                 Moreover, we compared the heuristic algorithms'
                 performance to the optimal solution and conducted
                 sensitivity analysis on the three parameters (c, k, and
                 d) and on the ratio between the trusted and untrusted
                 agents.",
  acknowledgement = ack-nhfb,
  articleno =    "16",
  fjournal =     "ACM Transactions on Knowledge Discovery from Data
                 (TKDD)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1054",
}

@Article{Hernandez-Orallo:2014:PRC,
  author =       "Jos{\'e} Hern{\'a}ndez-Orallo",
  title =        "Probabilistic Reframing for Cost-Sensitive
                 Regression",
  journal =      j-TKDD,
  volume =       "8",
  number =       "4",
  pages =        "17:1--17:??",
  month =        aug,
  year =         "2014",
  CODEN =        "????",
  DOI =          "http://dx.doi.org/10.1145/2641758",
  ISSN =         "1556-4681 (print), 1556-472X (electronic)",
  ISSN-L =       "1556-4681",
  bibdate =      "Tue Aug 26 17:49:02 MDT 2014",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tkdd/;
                 http://www.math.utah.edu/pub/tex/bib/tkdd.bib",
  abstract =     "Common-day applications of predictive models usually
                 involve the full use of the available contextual
                 information. When the operating context changes, one
                 may fine-tune the by-default (incontextual) prediction
                 or may even abstain from predicting a value (a reject).
                 Global reframing solutions, where the same function is
                 applied to adapt the estimated outputs to a new cost
                 context, are possible solutions here. An alternative
                 approach, which has not been studied in a comprehensive
                 way for regression in the knowledge discovery and data
                 mining literature, is the use of a local (e.g.,
                 probabilistic) reframing approach, where decisions are
                 made according to the estimated output and a
                 reliability, confidence, or probability estimation. In
                 this article, we advocate for a simple two-parameter
                 (mean and variance) approach, working with a normal
                 conditional probability density. Given the conditional
                 mean produced by any regression technique, we develop
                 lightweight ``enrichment'' methods that produce good
                 estimates of the conditional variance, which are used
                 by the probabilistic (local) reframing methods. We
                 apply these methods to some very common families of
                 cost-sensitive problems, such as optimal predictions in
                 (auction) bids, asymmetric loss scenarios, and
                 rejection rules.",
  acknowledgement = ack-nhfb,
  articleno =    "17",
  fjournal =     "ACM Transactions on Knowledge Discovery from Data
                 (TKDD)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1054",
}
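
%%% One standard instance of the local (probabilistic) reframing
%%% idea: with a normal predictive density $N(\mu, \sigma^2)$ and
%%% linear asymmetric loss, the expected-cost minimizer is a
%%% quantile of the density rather than its mean. A sketch under
%%% those assumptions (not the article's full method):
%%%
%%%     from scipy.stats import norm
%%%
%%%     def reframe_asymmetric(mu, sigma, under_cost, over_cost):
%%%         # The optimal prediction is the under/(under+over)
%%%         # quantile of the predictive distribution.
%%%         q = under_cost / (under_cost + over_cost)
%%%         return norm.ppf(q, loc=mu, scale=sigma)
%%%
%%%     # Underprediction 3x costlier: the prediction shifts up.
%%%     print(reframe_asymmetric(100.0, 15.0, 3.0, 1.0))  # ~110.1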

@Article{Miettinen:2014:MMD,
  author =       "Pauli Miettinen and Jilles Vreeken",
  title =        "{MDL4BMF}: Minimum Description Length for {Boolean}
                 Matrix Factorization",
  journal =      j-TKDD,
  volume =       "8",
  number =       "4",
  pages =        "18:1--18:??",
  month =        oct,
  year =         "2014",
  CODEN =        "????",
  DOI =          "http://dx.doi.org/10.1145/2601437",
  ISSN =         "1556-4681 (print), 1556-472X (electronic)",
  ISSN-L =       "1556-4681",
  bibdate =      "Tue Oct 7 18:45:26 MDT 2014",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tkdd/;
                 http://www.math.utah.edu/pub/tex/bib/tkdd.bib",
  abstract =     "Matrix factorizations-where a given data matrix is
                 approximated by a product of two or more factor
                 matrices-are powerful data mining tools. Among other
                 tasks, matrix factorizations are often used to separate
                 global structure from noise. This, however, requires
                 solving the ``model order selection problem'' of
                 determining the proper rank of the factorization, that
                 is, to answer where fine-grained structure stops, and
                 where noise starts. Boolean Matrix Factorization
                 (BMF), where data, factors, and matrix product are
                 Boolean, has in recent years received increased
                 attention from the data mining community. The technique
                 has desirable properties, such as high interpretability
                 and natural sparsity. Yet, so far no method for
                 selecting the correct model order for BMF has been
                 available. In this article, we propose the use of the
                 Minimum Description Length (MDL) principle for this
                 task. Besides solving the problem, this well-founded
                 approach has numerous benefits; for example, it is
                 automatic, does not require a likelihood function, is
                 fast, and, as experiments show, is highly accurate. We
                 formulate the description length function for BMF in
                 general, making it applicable to any BMF algorithm. We
                 discuss how to construct an appropriate encoding:
                 starting from a simple and intuitive approach, we
                 arrive at a highly efficient data-to-model--based
                 encoding for BMF. We extend an existing algorithm for
                 BMF to use MDL to identify the best Boolean matrix
                 factorization, analyze the complexity of the problem,
                 and perform an extensive experimental evaluation to
                 study its behavior.",
  acknowledgement = ack-nhfb,
  articleno =    "18",
  fjournal =     "ACM Transactions on Knowledge Discovery from Data
                 (TKDD)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1054",
}
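
%%% To make the model-order idea concrete, here is a naive two-part
%%% MDL score for a Boolean factorization: bits for the factor
%%% matrices plus bits to list the cells where the Boolean product
%%% disagrees with the data; the rank minimizing the score wins.
%%% (Illustrative encoding only; the article derives far better
%%% data-to-model encodings.)
%%%
%%%     import numpy as np
%%%
%%%     def mdl_score_naive(X, B, C):
%%%         n, m = X.shape
%%%         k = B.shape[1]
%%%         recon = (B.astype(int) @ C.astype(int)) > 0  # Boolean product
%%%         errors = np.count_nonzero(recon != X.astype(bool))
%%%         model_bits = k * (n + m)                     # one bit per cell
%%%         data_bits = errors * (np.log2(n) + np.log2(m))
%%%         return model_bits + data_bits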

@Article{Tang:2014:FSS,
  author =       "Jiliang Tang and Huan Liu",
  title =        "Feature Selection for Social Media Data",
  journal =      j-TKDD,
  volume =       "8",
  number =       "4",
  pages =        "19:1--19:??",
  month =        oct,
  year =         "2014",
  CODEN =        "????",
  DOI =          "http://dx.doi.org/10.1145/2629587",
  ISSN =         "1556-4681 (print), 1556-472X (electronic)",
  ISSN-L =       "1556-4681",
  bibdate =      "Tue Oct 7 18:45:26 MDT 2014",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tkdd/;
                 http://www.math.utah.edu/pub/tex/bib/tkdd.bib",
  abstract =     "Feature selection is widely used in preparing
                 high-dimensional data for effective data mining. The
                 explosive popularity of social media produces massive
                 and high-dimensional data at an unprecedented rate,
                 presenting new challenges to feature selection. Social
                 media data consists of (1) traditional
                 high-dimensional, attribute-value data such as posts,
                 tweets, comments, and images, and (2) linked data that
                 provides social context for posts and describes the
                 relationships between social media users as well as who
                 generates the posts, and so on. The nature of social
                 media also determines that its data is massive, noisy,
                 and incomplete, which exacerbates the already
                 challenging problem of feature selection. In this
                 article, we study a novel feature selection problem of
                 selecting features for social media data with its
                 social context. In detail, we illustrate the
                 differences between attribute-value data and social
                 media data, and investigate whether linked data can be
                 exploited in a new feature selection framework by
                 taking advantage of social science theories. We design
                 and
                 conduct experiments on datasets from real-world social
                 media Web sites, and the empirical results demonstrate
                 that the proposed framework can significantly improve
                 the performance of feature selection. Further
                 experiments are conducted to evaluate the effects of
                 user--user and user--post relationships manifested in
                 linked data on feature selection, and research issues
                 for future work are discussed.",
  acknowledgement = ack-nhfb,
  articleno =    "19",
  fjournal =     "ACM Transactions on Knowledge Discovery from Data
                 (TKDD)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1054",
}

@Article{Riondato:2014:EDA,
  author =       "Matteo Riondato and Eli Upfal",
  title =        "Efficient Discovery of Association Rules and Frequent
                 Itemsets through Sampling with Tight Performance
                 Guarantees",
  journal =      j-TKDD,
  volume =       "8",
  number =       "4",
  pages =        "20:1--20:??",
  month =        oct,
  year =         "2014",
  CODEN =        "????",
  DOI =          "http://dx.doi.org/10.1145/2629586",
  ISSN =         "1556-4681 (print), 1556-472X (electronic)",
  ISSN-L =       "1556-4681",
  bibdate =      "Tue Oct 7 18:45:26 MDT 2014",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tkdd/;
                 http://www.math.utah.edu/pub/tex/bib/tkdd.bib",
  abstract =     "The tasks of extracting (top- K ) Frequent Itemsets
                 (FIs) and Association Rules (ARs) are fundamental
                 primitives in data mining and database applications.
                 Exact algorithms for these problems exist and are
                 widely used, but their running time is hindered by the
                 need of scanning the entire dataset, possibly multiple
                 times. High-quality approximations of FIs and ARs are
                 sufficient for most practical uses. Sampling techniques
                 can be used for fast discovery of approximate
                 solutions, but works exploring this technique did not
                 provide satisfactory performance guarantees on the
                 quality of the approximation due to the difficulty of
                 bounding the probability of under- or oversampling any
                 one of an unknown number of frequent itemsets. We
                 circumvent this issue by applying the statistical
                 concept of Vapnik--Chervonenkis (VC) dimension to
                 develop a novel technique for providing tight bounds on
                 the sample size that guarantees approximation of the
                 (top-K) FIs and ARs within user-specified parameters.
                 The resulting sample size is linearly dependent on the
                 VC-dimension of a range space associated with the
                 dataset. We analyze the VC-dimension of this range
                 space and show that it is upper bounded by an
                 easy-to-compute characteristic quantity of the dataset,
                 the d-index, namely, the maximum integer d such that
                 the dataset contains at least d transactions of length
                 at least d, none of which is a superset of, or equal
                 to, another. We show that this bound is tight for
                 a large class of datasets. The resulting sample size is
                 a significant improvement over previous known results.
                 We present an extensive experimental evaluation of our
                 technique on real and artificial datasets,
                 demonstrating the practicality of our methods, and
                 showing that they achieve even higher quality
                 approximations than what is guaranteed by the
                 analysis.",
  acknowledgement = ack-nhfb,
  articleno =    "20",
  fjournal =     "ACM Transactions on Knowledge Discovery from Data
                 (TKDD)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1054",
}
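
%%% The d-index defined in the abstract is easy to bound with an
%%% h-index-style scan over transaction lengths: the largest d with
%%% at least d transactions of length >= d. The full definition
%%% also requires that none of those d transactions be a superset
%%% of, or equal to, another; this sketch skips that check and so
%%% returns an upper bound:
%%%
%%%     def d_index_upper_bound(transactions):
%%%         lengths = sorted((len(t) for t in transactions),
%%%                          reverse=True)
%%%         d = 0
%%%         for i, length in enumerate(lengths, start=1):
%%%             if length >= i:
%%%                 d = i
%%%         return d
%%%
%%%     print(d_index_upper_bound([{1, 2, 3}, {1, 2},
%%%                                {2, 3, 4}, {5}]))  # -> 2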

@Article{Burton:2014:DSC,
  author =       "Scott H. Burton and Christophe G. Giraud-Carrier",
  title =        "Discovering Social Circles in Directed Graphs",
  journal =      j-TKDD,
  volume =       "8",
  number =       "4",
  pages =        "21:1--21:??",
  month =        aug,
  year =         "2014",
  CODEN =        "????",
  DOI =          "http://dx.doi.org/10.1145/2641759",
  ISSN =         "1556-4681 (print), 1556-472X (electronic)",
  ISSN-L =       "1556-4681",
  bibdate =      "Tue Aug 26 17:49:02 MDT 2014",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tkdd/;
                 http://www.math.utah.edu/pub/tex/bib/tkdd.bib",
  abstract =     "We examine the problem of identifying social circles,
                 or sets of cohesive and mutually aware nodes
                 surrounding an initial query set, in directed graphs
                 where the complete graph is not known beforehand. This
                 problem differs from local community mining, in that
                 the query set defines the circle of interest. We
                 explicitly handle edge direction, as in many cases
                 relationships are not symmetric, and focus on the local
                 context because many real-world graphs cannot be
                 feasibly known. We outline several issues that are
                 unique to this context, introduce a quality function to
                 measure the value of including a particular node in an
                 emerging social circle, and describe a greedy social
                 circle discovery algorithm. We demonstrate the
                 effectiveness of this approach on artificial
                 benchmarks, large networks with topical community
                 labels, and several real-world case studies.",
  acknowledgement = ack-nhfb,
  articleno =    "21",
  fjournal =     "ACM Transactions on Knowledge Discovery from Data
                 (TKDD)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1054",
}

@Article{Paul:2014:RPL,
  author =       "Saurabh Paul and Christos Boutsidis and Malik
                 Magdon-Ismail and Petros Drineas",
  title =        "Random Projections for Linear Support Vector
                 Machines",
  journal =      j-TKDD,
  volume =       "8",
  number =       "4",
  pages =        "22:1--22:??",
  month =        oct,
  year =         "2014",
  CODEN =        "????",
  DOI =          "http://dx.doi.org/10.1145/2641760",
  ISSN =         "1556-4681 (print), 1556-472X (electronic)",
  ISSN-L =       "1556-4681",
  bibdate =      "Tue Oct 7 18:45:26 MDT 2014",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tkdd/;
                 http://www.math.utah.edu/pub/tex/bib/tkdd.bib",
  abstract =     "Let $X$ be a data matrix of rank $ \rho $, whose rows
                 represent $n$ points in $d$-dimensional space. The
                 linear support vector machine constructs a hyperplane
                 separator that maximizes the 1-norm soft margin. We
                 develop a new oblivious dimension reduction technique
                 that is precomputed and can be applied to any input
                 matrix $X$. We prove that, with high probability, the
                 margin and minimum enclosing ball in the feature space
                 are preserved to within $ \epsilon $-relative error,
                 ensuring generalization comparable to that in the original
                 space in the case of classification. For regression, we
                 show that the margin is preserved to $ \epsilon
                 $-relative error with high probability. We present
                 extensive experiments with real and synthetic data to
                 support our theory.",
  acknowledgement = ack-nhfb,
  articleno =    "22",
  fjournal =     "ACM Transactions on Knowledge Discovery from Data
                 (TKDD)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1054",
}
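
%%% The oblivious dimension reduction in the abstract can be as
%%% simple as one dense Gaussian sketch drawn independently of the
%%% data, with the linear SVM then trained in the reduced space. A
%%% minimal sketch of one common construction (the article's
%%% guarantees cover such data-independent maps):
%%%
%%%     import numpy as np
%%%
%%%     def random_project(X, k, seed=0):
%%%         # R has i.i.d. N(0, 1/k) entries, so inner products,
%%%         # margins, and distances are preserved up to
%%%         # (1 +/- eps) with high probability.
%%%         rng = np.random.default_rng(seed)
%%%         R = rng.normal(0.0, 1.0 / np.sqrt(k),
%%%                        size=(X.shape[1], k))
%%%         return X @ R
%%%
%%%     X = np.random.randn(200, 5000)    # n = 200 points, d = 5000
%%%     X_low = random_project(X, 256)    # train the SVM on X_low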

@Article{Erdo:2014:RGN,
  author =       "D{\'o}ra Erd{\H{o}}s and Rainer Gemulla and Evimaria
                 Terzi",
  title =        "Reconstructing Graphs from Neighborhood Data",
  journal =      j-TKDD,
  volume =       "8",
  number =       "4",
  pages =        "23:1--23:??",
  month =        aug,
  year =         "2014",
  CODEN =        "????",
  DOI =          "http://dx.doi.org/10.1145/2641761",
  ISSN =         "1556-4681 (print), 1556-472X (electronic)",
  ISSN-L =       "1556-4681",
  bibdate =      "Tue Aug 26 17:49:02 MDT 2014",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tkdd/;
                 http://www.math.utah.edu/pub/tex/bib/tkdd.bib",
  abstract =     "Consider a social network and suppose that we are only
                 given the number of common friends between each pair of
                 users. Can we reconstruct the underlying network?
                 Similarly, consider a set of documents and the words
                 that appear in them. If we only know the number of
                 common words for every pair of documents, as well as
                 the number of common documents for every pair of words,
                 can we infer which words appear in which documents? In
                 this article, we develop a general methodology for
                 answering questions like these. We formalize these
                 questions in what we call the {\em R}econstruct
                 problem: given information about the common neighbors
                 of nodes in a network, our goal is to reconstruct the
                 hidden binary matrix that indicates the presence or
                 absence of relationships between individual nodes. In
                 fact, we propose two different variants of this
                 problem: one where the number of connections of every
                 node (i.e., the degree of every node) is known and a
                 second one where it is unknown. We call these variants
                 the degree-aware and the degree-oblivious versions of
                 the Reconstruct problem, respectively. Our algorithms
                 for both variants exploit the properties of the
                 singular value decomposition of the hidden binary
                 matrix. More specifically, we show that using the
                 available neighborhood information, we can reconstruct
                 the hidden matrix by finding the components of its
                 singular value decomposition and then combining them
                 appropriately. Our extensive experimental study
                 suggests that our methods are able to reconstruct
                 binary matrices of different characteristics with up to
                 100\% accuracy.",
  acknowledgement = ack-nhfb,
  articleno =    "23",
  fjournal =     "ACM Transactions on Knowledge Discovery from Data
                 (TKDD)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1054",
}
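
%%% The algebra behind the Reconstruct problem, in a few NumPy
%%% lines: for an undirected graph with adjacency matrix A, the
%%% common-neighbor matrix is A @ A, whose diagonal holds node
%%% degrees (the degree-aware variant). The article's algorithms
%%% invert this map by recombining SVD components; shown here are
%%% only the forward identity and the decomposition step:
%%%
%%%     import numpy as np
%%%
%%%     A = np.array([[0, 1, 1, 0],      # hidden binary matrix
%%%                   [1, 0, 1, 1],
%%%                   [1, 1, 0, 0],
%%%                   [0, 1, 0, 0]])
%%%     N = A @ A                        # N[i, j] = common neighbors
%%%     assert N[0, 1] == 1              # nodes 0 and 1 share node 2
%%%     U, s, Vt = np.linalg.svd(N)      # components to recombine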

@Article{Acharya:2014:OFC,
  author =       "Ayan Acharya and Eduardo R. Hruschka and Joydeep Ghosh
                 and Sreangsu Acharyya",
  title =        "An Optimization Framework for Combining Ensembles of
                 Classifiers and Clusterers with Applications to
                 Nontransductive Semisupervised Learning and Transfer
                 Learning",
  journal =      j-TKDD,
  volume =       "9",
  number =       "1",
  pages =        "1:1--1:??",
  month =        aug,
  year =         "2014",
  CODEN =        "????",
  DOI =          "http://dx.doi.org/10.1145/2601435",
  ISSN =         "1556-4681 (print), 1556-472X (electronic)",
  ISSN-L =       "1556-4681",
  bibdate =      "Tue Aug 26 17:49:05 MDT 2014",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tkdd/;
                 http://www.math.utah.edu/pub/tex/bib/tkdd.bib",
  abstract =     "Unsupervised models can provide supplementary soft
                 constraints to help classify new ``target'' data
                 because similar instances in the target set are more
                 likely to share the same class label. Such models can
                 also help detect possible differences between training
                 and target distributions, which is useful in
                 applications where concept drift may take place, as in
                 transfer learning settings. This article describes a
                 general optimization framework that takes as input
                 class membership estimates from existing classifiers
                 learned on previously encountered ``source'' (or
                 training) data, as well as a similarity matrix from a
                 cluster ensemble operating solely on the target (or
                 test) data to be classified, and yields a consensus
                 labeling of the target data. More precisely, the
                 application settings considered are nontransductive
                 semisupervised and transfer learning scenarios where
                 the training data are used only to build an ensemble of
                 classifiers and are subsequently discarded before
                 classifying the target data. The framework admits a
                 wide range of loss functions and
                 classification/clustering methods. It exploits
                 properties of Bregman divergences in conjunction with
                 Legendre duality to yield a principled and scalable
                 approach. A variety of experiments show that the
                 proposed framework can yield results substantially
                 superior to those provided by na{\"\i}vely applying
                 classifiers learned on the original task to the target
                 data. In addition, we show that the proposed approach,
                  even though it is not conceptually transductive, can
                  provide better results than some popular transductive
                  learning techniques.",
  acknowledgement = ack-nhfb,
  articleno =    "1",
  fjournal =     "ACM Transactions on Knowledge Discovery from Data
                 (TKDD)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1054",
}
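
%% Not the article's Bregman-divergence machinery, but a toy NumPy sketch
%% of the input/output contract its abstract describes: class-probability
%% estimates from the source-trained classifiers are smoothed over a
%% cluster-ensemble co-association matrix until similar target instances
%% agree.  All names here are illustrative, not from the paper.
%%
%%   import numpy as np
%%
%%   def consensus_labels(P, S, alpha=0.7, iters=50):
%%       # P: (n, c) averaged class probabilities from the classifier
%%       # ensemble; S: (n, n) nonnegative co-association matrix from a
%%       # cluster ensemble on the target data only (nonzero row sums).
%%       W = S / S.sum(axis=1, keepdims=True)     # row-stochastic similarity
%%       Y = P.copy()
%%       for _ in range(iters):
%%           Y = alpha * P + (1 - alpha) * W @ Y  # pull similar items together
%%       return Y.argmax(axis=1)                  # consensus labeling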

@Article{Boedihardjo:2014:FEL,
  author =       "Arnold P. Boedihardjo and Chang-Tien Lu and Bingsheng
                 Wang",
  title =        "A Framework for Exploiting Local Information to
                 Enhance Density Estimation of Data Streams",
  journal =      j-TKDD,
  volume =       "9",
  number =       "1",
  pages =        "2:1--2:??",
  month =        aug,
  year =         "2014",
  CODEN =        "????",
  DOI =          "http://dx.doi.org/10.1145/2629618",
  ISSN =         "1556-4681 (print), 1556-472X (electronic)",
  ISSN-L =       "1556-4681",
  bibdate =      "Tue Aug 26 17:49:05 MDT 2014",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tkdd/;
                 http://www.math.utah.edu/pub/tex/bib/tkdd.bib",
  abstract =     "The Probability Density Function (PDF) is the
                 fundamental data model for a variety of stream mining
                 algorithms. Existing works apply the standard
                 nonparametric Kernel Density Estimator (KDE) to
                  approximate the PDF of data streams. As a result,
                  these stream-based KDEs cannot accurately capture
                  complex local density features. In this article, we
                  propose the use of Local Regions (LRs) to model local
                  density
                 information in univariate data streams. In-depth
                 theoretical analyses are presented to justify the
                 effectiveness of the LR-based KDE. Based on the
                 analyses, we develop the General Local rEgion AlgorithM
                 (GLEAM) to enhance the estimation quality of
                 structurally complex univariate distributions for
                 existing stream-based KDEs. A set of algorithmic
                 optimizations is designed to improve the query
                  throughput of GLEAM and to achieve linear-time
                  computation. Additionally, a comprehensive suite of
                 experiments was conducted to test the effectiveness and
                 efficiency of GLEAM.",
  acknowledgement = ack-nhfb,
  articleno =    "2",
  fjournal =     "ACM Transactions on Knowledge Discovery from Data
                 (TKDD)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1054",
}
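
%% For orientation, the textbook estimator the article builds on: a
%% univariate Gaussian KDE with a single global bandwidth h, the kind of
%% uniform smoothing that blurs local density features and motivates
%% region-specific treatment (NumPy sketch, not the GLEAM code):
%%
%%   import numpy as np
%%
%%   def gaussian_kde(samples, x, h):
%%       # samples: observed 1-D data; x: query points; h: bandwidth.
%%       u = (x[:, None] - samples[None, :]) / h
%%       k = np.exp(-0.5 * u * u) / np.sqrt(2.0 * np.pi)
%%       return k.sum(axis=1) / (len(samples) * h)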

@Article{Ordonez:2014:BVS,
  author =       "Carlos Ordonez and Carlos Garcia-Alvarado and
                 Veerabhadaran Baladandayuthapani",
  title =        "{Bayesian} Variable Selection in Linear Regression in
                 One Pass for Large Datasets",
  journal =      j-TKDD,
  volume =       "9",
  number =       "1",
  pages =        "3:1--3:??",
  month =        aug,
  year =         "2014",
  CODEN =        "????",
  DOI =          "http://dx.doi.org/10.1145/2629617",
  ISSN =         "1556-4681 (print), 1556-472X (electronic)",
  ISSN-L =       "1556-4681",
  bibdate =      "Tue Aug 26 17:49:05 MDT 2014",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tkdd/;
                 http://www.math.utah.edu/pub/tex/bib/tkdd.bib",
  abstract =     "Bayesian models are generally computed with Markov
                 Chain Monte Carlo (MCMC) methods. The main disadvantage
                 of MCMC methods is the large number of iterations they
                 need to sample the posterior distributions of model
                 parameters, especially for large datasets. On the other
                 hand, variable selection remains a challenging problem
                 due to its combinatorial search space, where Bayesian
                 models are a promising solution. In this work, we study
                 how to accelerate Bayesian model computation for
                 variable selection in linear regression. We propose a
                  fast algorithm based on the Gibbs sampler, a widely
                  used MCMC method, incorporating several optimizations.
                  We use a
                 Zellner prior for the regression coefficients, an
                 improper prior on variance, and a conjugate prior
                 Gaussian distribution, which enable dataset
                 summarization in one pass, thus exploiting an augmented
                 set of sufficient statistics. Thereafter, the algorithm
                 iterates in main memory. Sufficient statistics are
                 indexed with a sparse binary vector to efficiently
                 compute matrix projections based on selected variables.
                  The probabilities of discovered variable subsets,
                  recording whether each variable is selected or
                  discarded, are stored in a hash table for fast
                  retrieval in future iterations. We study
                 how to integrate our algorithm into a Database
                 Management System (DBMS), exploiting aggregate
                 User-Defined Functions for parallel data summarization
                 and stored procedures to manipulate matrices with
                 arrays. An experimental evaluation with real datasets
                  assesses accuracy and time performance, comparing our
                 DBMS-based algorithm with the R package. Our algorithm
                 is shown to produce accurate results, scale linearly on
                 dataset size, and run orders of magnitude faster than
                 the R package.",
  acknowledgement = ack-nhfb,
  articleno =    "3",
  fjournal =     "ACM Transactions on Knowledge Discovery from Data
                 (TKDD)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1054",
}
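
%% A sketch of the one-pass summarization the abstract relies on (NumPy;
%% `stream` is a hypothetical iterator of (X, y) chunks, not the authors'
%% DBMS interface).  After a single scan, the Gibbs sampler can iterate in
%% main memory, since every projection onto a variable subset is just an
%% index into the summaries:
%%
%%   import numpy as np
%%
%%   def summarize(stream):
%%       # Single pass: accumulate n, X^T X, and X^T y.
%%       n, XtX, Xty = 0, None, None
%%       for X, y in stream:
%%           if XtX is None:
%%               d = X.shape[1]
%%               XtX, Xty = np.zeros((d, d)), np.zeros(d)
%%           n += len(y)
%%           XtX += X.T @ X
%%           Xty += X.T @ y
%%       return n, XtX, Xty
%%
%%   # A subset gamma (list of column indices) never touches the data again:
%%   # XtX[np.ix_(gamma, gamma)] and Xty[gamma] suffice for its posterior.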

@Article{Fei:2014:SSB,
  author =       "Hongliang Fei and Jun Huan",
  title =        "Structured Sparse Boosting for Graph Classification",
  journal =      j-TKDD,
  volume =       "9",
  number =       "1",
  pages =        "4:1--4:??",
  month =        aug,
  year =         "2014",
  CODEN =        "????",
  DOI =          "http://dx.doi.org/10.1145/2629328",
  ISSN =         "1556-4681 (print), 1556-472X (electronic)",
  ISSN-L =       "1556-4681",
  bibdate =      "Tue Aug 26 17:49:05 MDT 2014",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tkdd/;
                 http://www.math.utah.edu/pub/tex/bib/tkdd.bib",
  abstract =     "Boosting is a highly effective algorithm that produces
                 a linear combination of weak classifiers (a.k.a. base
                 learners) to obtain high-quality classification models.
                 In this article, we propose a generalized logit boost
                 algorithm in which base learners have structural
                 relationships in the functional space. Although such
                 relationships are generic, our work is particularly
                 motivated by the emerging topic of pattern-based
                 classification for semistructured data including
                 graphs. Toward an efficient incorporation of the
                 structure information, we have designed a general model
                 in which we use an undirected graph to capture the
                 relationship of subgraph-based base learners. In our
                 method, we employ both L$_1$ and Laplacian-based L$_2$
                 regularization to logit boosting to achieve model
                 sparsity and smoothness in the functional space spanned
                 by the base learners. We have derived efficient
                 optimization algorithms based on coordinate descent for
                 the new boosting formulation and theoretically prove
                  that it exhibits a natural grouping effect for
                  spatially nearby or overlapping base learners and that
                  the
                 resulting estimator is consistent. Additionally,
                 motivated by the connection between logit boosting and
                 logistic regression, we extend our structured sparse
                 regularization framework to logistic regression for
                  vectorial data in which features are structured.
                  Through a comprehensive experimental study and
                  comparison with the state-of-the-art, we have
                  demonstrated the
                 effectiveness of the proposed learning method.",
  acknowledgement = ack-nhfb,
  articleno =    "4",
  fjournal =     "ACM Transactions on Knowledge Discovery from Data
                 (TKDD)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1054",
}
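
%% The penalty structure the abstract describes, written out as a sketch
%% (NumPy, with a plain logistic-loss stand-in rather than the authors'
%% logit-boosting code): L1 for sparsity plus a graph-Laplacian quadratic
%% for smoothness over related base learners.
%%
%%   import numpy as np
%%
%%   def structured_sparse_objective(beta, X, y, L, lam1, lam2):
%%       # y in {-1, +1}; L is the Laplacian of the graph relating the
%%       # base learners/features.  lam1*||beta||_1 promotes sparsity;
%%       # lam2*beta^T L beta penalizes weight differences across edges,
%%       # which yields the grouping effect for connected learners.
%%       loss = np.logaddexp(0.0, -y * (X @ beta)).mean()
%%       return loss + lam1 * np.abs(beta).sum() + lam2 * beta @ L @ beta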

@Article{Xu:2014:GGB,
  author =       "Zhiqiang Xu and Yiping Ke and Yi Wang and Hong Cheng
                 and James Cheng",
  title =        "{GBAGC}: a General {Bayesian} Framework for Attributed
                 Graph Clustering",
  journal =      j-TKDD,
  volume =       "9",
  number =       "1",
  pages =        "5:1--5:??",
  month =        aug,
  year =         "2014",
  CODEN =        "????",
  DOI =          "http://dx.doi.org/10.1145/2629616",
  ISSN =         "1556-4681 (print), 1556-472X (electronic)",
  ISSN-L =       "1556-4681",
  bibdate =      "Tue Aug 26 17:49:05 MDT 2014",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tkdd/;
                 http://www.math.utah.edu/pub/tex/bib/tkdd.bib",
  abstract =     "Graph clustering, also known as community detection,
                 is a long-standing problem in data mining. In recent
                 years, with the proliferation of rich attribute
                 information available for objects in real-world graphs,
                 how to leverage not only structural but also attribute
                 information for clustering attributed graphs becomes a
                 new challenge. Most existing works took a
                 distance-based approach. They proposed various distance
                 measures to fuse structural and attribute information
                 and then applied standard techniques for graph
                 clustering based on these distance measures. In this
                 article, we take an alternative view and propose a
                 novel Bayesian framework for attributed graph
                 clustering. Our framework provides a general and
                 principled solution to modeling both the structural and
                 the attribute aspects of a graph. It avoids the
                 artificial design of a distance measure in existing
                 methods and, furthermore, can seamlessly handle graphs
                 with different types of edges and vertex attributes. We
                 develop an efficient variational method for graph
                 clustering under this framework and derive two concrete
                 algorithms for clustering unweighted and weighted
                 attributed graphs. Experimental results on large
                 real-world datasets show that our algorithms
                 significantly outperform the state-of-the-art
                 distance-based method, in terms of both effectiveness
                 and efficiency.",
  acknowledgement = ack-nhfb,
  articleno =    "5",
  fjournal =     "ACM Transactions on Knowledge Discovery from Data
                 (TKDD)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1054",
}

@Article{Coscia:2014:UHO,
  author =       "Michele Coscia and Giulio Rossetti and Fosca Giannotti
                 and Dino Pedreschi",
  title =        "Uncovering Hierarchical and Overlapping Communities
                 with a Local-First Approach",
  journal =      j-TKDD,
  volume =       "9",
  number =       "1",
  pages =        "6:1--6:??",
  month =        aug,
  year =         "2014",
  CODEN =        "????",
  DOI =          "http://dx.doi.org/10.1145/2629511",
  ISSN =         "1556-4681 (print), 1556-472X (electronic)",
  ISSN-L =       "1556-4681",
  bibdate =      "Tue Aug 26 17:49:05 MDT 2014",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tkdd/;
                 http://www.math.utah.edu/pub/tex/bib/tkdd.bib",
  abstract =     "Community discovery in complex networks is the task of
                 organizing a network's structure by grouping together
                 nodes related to each other. Traditional approaches are
                 based on the assumption that there is a global-level
                 organization in the network. However, in many
                 scenarios, each node is the bearer of complex
                 information and cannot be classified in disjoint
                 clusters. The top-down global view of the partition
                 approach is not designed for this. Here, we represent
                 this complex information as multiple latent labels, and
                 we postulate that edges in the networks are created
                 among nodes carrying similar labels. The latent labels
                 are the communities a node belongs to and we discover
                 them with a simple local-first approach to community
                 discovery. This is achieved by democratically letting
                 each node vote for the communities it sees surrounding
                 it in its limited view of the global system, its ego
                 neighborhood, using a label propagation algorithm,
                 assuming that each node is aware of the label it shares
                 with each of its connections. The local communities are
                 merged hierarchically, unveiling the modular
                 organization of the network at the global level and
                 identifying overlapping groups and groups of groups. We
                  tested this intuition against state-of-the-art
                  overlapping community discovery methods and found that
                  our new method improves the quality of the obtained
                  communities in the chosen scenarios. We performed
                  tests on benchmark and real-world networks, evaluating
                  the quality of the community coverage by using the
                  extracted communities to predict the metadata attached
                  to the nodes, which we consider external information
                  about the latent labels. We also explain why
                  real-world networks contain overlapping communities
                  and how our logic is able to capture them. Finally, we
                  show that our method is deterministic, incremental,
                  and of limited time complexity, so that it can be used
                  on real-world-scale networks.",
  acknowledgement = ack-nhfb,
  articleno =    "6",
  fjournal =     "ACM Transactions on Knowledge Discovery from Data
                 (TKDD)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1054",
}
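
%% The voting step the abstract sketches is classic label propagation; a
%% minimal pure-Python version of the generic algorithm (not the authors'
%% exact variant; in their local-first pipeline it runs inside each ego
%% network, and the hierarchical merging is omitted here):
%%
%%   import random
%%
%%   def label_propagation(adj, iters=10, seed=0):
%%       # adj: dict mapping node -> set of neighbors.  Each node
%%       # repeatedly adopts the label most common among its neighbors;
%%       # ties break at random.  Surviving labels define communities.
%%       rng = random.Random(seed)
%%       labels = {v: v for v in adj}
%%       nodes = list(adj)
%%       for _ in range(iters):
%%           rng.shuffle(nodes)
%%           for v in nodes:
%%               if adj[v]:
%%                   votes = {}
%%                   for u in adj[v]:
%%                       votes[labels[u]] = votes.get(labels[u], 0) + 1
%%                   top = max(votes.values())
%%                   labels[v] = rng.choice(
%%                       [l for l, c in votes.items() if c == top])
%%       return labels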

@Article{Wang:2014:GML,
  author =       "Guangtao Wang and Qinbao Song and Xueying Zhang and
                 Kaiyuan Zhang",
  title =        "A Generic Multilabel Learning-Based Classification
                 Algorithm Recommendation Method",
  journal =      j-TKDD,
  volume =       "9",
  number =       "1",
  pages =        "7:1--7:??",
  month =        oct,
  year =         "2014",
  CODEN =        "????",
  DOI =          "http://dx.doi.org/10.1145/2629474",
  ISSN =         "1556-4681 (print), 1556-472X (electronic)",
  ISSN-L =       "1556-4681",
  bibdate =      "Fri Oct 10 17:19:10 MDT 2014",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tkdd/;
                 http://www.math.utah.edu/pub/tex/bib/tkdd.bib",
  abstract =     "As more and more classification algorithms continue to
                 be developed, recommending appropriate algorithms to a
                 given classification problem is increasingly important.
                 This article first distinguishes the algorithm
                 recommendation methods by two dimensions: (1)
                 meta-features, which are a set of measures used to
                 characterize the learning problems, and (2)
                 meta-target, which represents the relative performance
                 of the classification algorithms on the learning
                 problem. In contrast to the existing algorithm
                 recommendation methods whose meta-target is usually in
                 the form of either the ranking of candidate algorithms
                 or a single algorithm, this article proposes a new and
                 natural multilabel form to describe the meta-target.
                  This reflects the fact that, in practice, multiple
                  algorithms may be appropriate for a given problem.
                  Furthermore, a novel multilabel
                 learning-based generic algorithm recommendation method
                 is proposed, which views the algorithm recommendation
                  as a multilabel learning problem and solves it with
                  mature multilabel learning algorithms. To
                 evaluate the proposed multilabel learning-based
                 recommendation method, extensive experiments with 13
                 well-known classification algorithms, two kinds of
                  meta-targets (algorithm ranking and single algorithm),
                  and five different kinds of meta-features
                 are conducted on 1,090 benchmark learning problems. The
                 results show the effectiveness of our proposed
                 multilabel learning-based recommendation method.",
  acknowledgement = ack-nhfb,
  articleno =    "7",
  fjournal =     "ACM Transactions on Knowledge Discovery from Data
                 (TKDD)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1054",
}
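
%% A sketch of the multilabel formulation (Python with scikit-learn
%% assumed; the meta-dataset here is random stand-in data, not the 1,090
%% benchmark problems): meta-features in, a set of appropriate algorithms
%% out.
%%
%%   import numpy as np
%%   from sklearn.ensemble import RandomForestClassifier
%%   from sklearn.multioutput import MultiOutputClassifier
%%
%%   rng = np.random.default_rng(0)
%%   X_meta = rng.random((200, 15))          # meta-features per problem
%%   Y_meta = rng.random((200, 13)) < 0.3    # label j = algorithm j is apt
%%
%%   model = MultiOutputClassifier(RandomForestClassifier(n_estimators=50))
%%   model.fit(X_meta, Y_meta)
%%   print(model.predict(X_meta[:1]))        # recommended algorithm set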

@Article{Wang:2014:EEM,
  author =       "Pinghui Wang and John C. S. Lui and Bruno Ribeiro and
                 Don Towsley and Junzhou Zhao and Xiaohong Guan",
  title =        "Efficiently Estimating Motif Statistics of Large
                 Networks",
  journal =      j-TKDD,
  volume =       "9",
  number =       "2",
  pages =        "8:1--8:??",
  month =        sep,
  year =         "2014",
  CODEN =        "????",
  DOI =          "http://dx.doi.org/10.1145/2629564",
  ISSN =         "1556-4681 (print), 1556-472X (electronic)",
  ISSN-L =       "1556-4681",
  bibdate =      "Tue Oct 7 18:49:26 MDT 2014",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tkdd/;
                 http://www.math.utah.edu/pub/tex/bib/tkdd.bib",
  abstract =     "Exploring statistics of locally connected subgraph
                 patterns (also known as network motifs) has helped
                 researchers better understand the structure and
                 function of biological and Online Social Networks
                 (OSNs). Nowadays, the massive size of some critical
                  networks (often stored in already overloaded relational
                  databases) effectively limits the rate at which nodes
                 and edges can be explored, making it a challenge to
                 accurately discover subgraph statistics. In this work,
                 we propose sampling methods to accurately estimate
                 subgraph statistics from as few queried nodes as
                 possible. We present sampling algorithms that
                 efficiently and accurately estimate subgraph properties
                 of massive networks. Our algorithms require no
                 precomputation or complete network topology
                 information. At the same time, we provide theoretical
                 guarantees of convergence. We perform experiments using
                 widely known datasets and show that, for the same
                 accuracy, our algorithms require an order of magnitude
                  fewer queries (samples) than the current
                 state-of-the-art algorithms.",
  acknowledgement = ack-nhfb,
  articleno =    "8",
  fjournal =     "ACM Transactions on Knowledge Discovery from Data
                 (TKDD)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1054",
}
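
%% In the same spirit, though far simpler than the article's estimators:
%% a sketch that estimates the network average clustering coefficient
%% (over nodes of degree >= 2) from local neighbor queries alone,
%% assuming an adjacency dict of node -> set of neighbors.
%%
%%   import random
%%
%%   def avg_clustering_estimate(adj, trials=10000, seed=0):
%%       # Sample a node, then a random wedge u - v - w at that node, and
%%       # check whether the closing edge exists.  Only neighbor lookups
%%       # are needed -- no precomputation or full topology.
%%       rng = random.Random(seed)
%%       nodes = [v for v in adj if len(adj[v]) >= 2]
%%       closed = 0
%%       for _ in range(trials):
%%           v = rng.choice(nodes)
%%           u, w = rng.sample(sorted(adj[v]), 2)
%%           closed += w in adj[u]
%%       return closed / trials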

@Article{Zheng:2014:FHE,
  author =       "Li Zheng and Tao Li and Chris Ding",
  title =        "A Framework for Hierarchical Ensemble Clustering",
  journal =      j-TKDD,
  volume =       "9",
  number =       "2",
  pages =        "9:1--9:??",
  month =        sep,
  year =         "2014",
  CODEN =        "????",
  DOI =          "http://dx.doi.org/10.1145/2611380",
  ISSN =         "1556-4681 (print), 1556-472X (electronic)",
  ISSN-L =       "1556-4681",
  bibdate =      "Tue Oct 7 18:49:26 MDT 2014",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tkdd/;
                 http://www.math.utah.edu/pub/tex/bib/tkdd.bib",
  abstract =     "Ensemble clustering, as an important extension of the
                 clustering problem, refers to the problem of combining
                 different (input) clusterings of a given dataset to
                 generate a final (consensus) clustering that is a
                 better fit in some sense than existing clusterings.
                 Over the past few years, many ensemble clustering
                 approaches have been developed. However, most of them
                 are designed for partitional clustering methods, and
                 few research efforts have been reported for ensemble
                 hierarchical clustering methods. In this article, a
                 hierarchical ensemble clustering framework that can
                 naturally combine both partitional clustering and
                 hierarchical clustering results is proposed. In
                  addition, a novel method is developed for learning the
                  ultra-metric distance from the aggregated distance
                  matrices and for generating, based on this
                  ultra-metric, a final hierarchical clustering with
                  enhanced cluster separation. We
                 study three important problems: dendrogram description,
                 dendrogram combination, and dendrogram selection. We
                 develop two approaches for dendrogram selection based
                 on tree distances, and we investigate various
                 dendrogram distances for representing dendrograms. We
                 provide a systematic empirical study of the ensemble
                 hierarchical clustering problem. Experimental results
                 demonstrate the effectiveness of our proposed
                 approaches.",
  acknowledgement = ack-nhfb,
  articleno =    "9",
  fjournal =     "ACM Transactions on Knowledge Discovery from Data
                 (TKDD)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1054",
}
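
%% A sketch of one natural baseline for the dendrogram-combination step
%% (Python with SciPy assumed; the article's learned ultra-metric is more
%% refined than the plain average used here):
%%
%%   import numpy as np
%%   from scipy.cluster.hierarchy import cophenet, linkage
%%
%%   def combine_dendrograms(linkages):
%%       # Each input dendrogram induces an ultra-metric via its
%%       # cophenetic distances; average them (condensed form) and
%%       # re-cluster the average.
%%       D = np.mean([cophenet(Z) for Z in linkages], axis=0)
%%       return linkage(D, method="average")
%%
%% Partitional input clusterings could be folded into the same average,
%% e.g., as 0/1 co-association distances.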

@Article{Huai:2014:TPC,
  author =       "Baoxing Huai and Enhong Chen and Hengshu Zhu and Hui
                 Xiong and Tengfei Bao and Qi Liu and Jilei Tian",
  title =        "Toward Personalized Context Recognition for Mobile
                 Users: a Semisupervised {Bayesian} {HMM} Approach",
  journal =      j-TKDD,
  volume =       "9",
  number =       "2",
  pages =        "10:1--10:??",
  month =        sep,
  year =         "2014",
  CODEN =        "????",
  DOI =          "http://dx.doi.org/10.1145/2629504",
  ISSN =         "1556-4681 (print), 1556-472X (electronic)",
  ISSN-L =       "1556-4681",
  bibdate =      "Tue Oct 7 18:49:26 MDT 2014",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tkdd/;
                 http://www.math.utah.edu/pub/tex/bib/tkdd.bib",
  abstract =     "The problem of mobile context recognition targets the
                  identification of the semantic meaning of context in a
                 mobile environment. This plays an important role in
                 understanding mobile user behaviors and thus provides
                 the opportunity for the development of better
                 intelligent context-aware services. A key step of
                 context recognition is to model the personalized
                 contextual information of mobile users. Although many
                 studies have been devoted to mobile context modeling,
                 limited efforts have been made on the exploitation of
                 the sequential and dependency characteristics of mobile
                 contextual information. Also, the latent semantics
                 behind mobile context are often ambiguous and poorly
                 understood. Indeed, a promising direction is to
                 incorporate some domain knowledge of common contexts,
                 such as ``waiting for a bus'' or ``having dinner,'' by
                 modeling both labeled and unlabeled context data from
                 mobile users because there are often few labeled
                 contexts available in practice. To this end, in this
                 article, we propose a sequence-based semisupervised
                 approach to modeling personalized context for mobile
                 users. Specifically, we first exploit the Bayesian
                 Hidden Markov Model (B-HMM) for modeling context in the
                 form of probabilistic distributions and transitions of
                 raw context data. Also, we propose a sequential model
                 by extending B-HMM with the prior knowledge of
                 contextual features to model context more accurately.
                 Then, to efficiently learn the parameters and initial
                 values of the proposed models, we develop a novel
                 approach for parameter estimation by integrating the
                 Dirichlet Process Mixture (DPM) model and the Mixture
                 Unigram (MU) model. Furthermore, by incorporating both
                 user-labeled and unlabeled data, we propose a
                 semisupervised learning-based algorithm to identify and
                 model the latent semantics of context. Finally,
                 experimental results on real-world data clearly
                 validate both the efficiency and effectiveness of the
                 proposed approaches for recognizing personalized
                 context of mobile users.",
  acknowledgement = ack-nhfb,
  articleno =    "10",
  fjournal =     "ACM Transactions on Knowledge Discovery from Data
                 (TKDD)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1054",
}
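
%% For reference, the sequential backbone being extended: the scaled
%% forward recursion of a discrete HMM (NumPy sketch; B-HMM places
%% Bayesian priors over these parameters rather than fixing them):
%%
%%   import numpy as np
%%
%%   def forward_loglik(pi, A, B, obs):
%%       # pi: (k,) initial distribution; A: (k, k) transitions;
%%       # B: (k, m) emission probabilities; obs: observed symbol indices.
%%       alpha = pi * B[:, obs[0]]
%%       loglik = np.log(alpha.sum())
%%       alpha = alpha / alpha.sum()
%%       for o in obs[1:]:
%%           alpha = (alpha @ A) * B[:, o]
%%           c = alpha.sum()
%%           loglik += np.log(c)
%%           alpha = alpha / c
%%       return loglik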

@Article{Liu:2014:ADI,
  author =       "Siyuan Liu and Lei Chen and Lionel M. Ni",
  title =        "Anomaly Detection from Incomplete Data",
  journal =      j-TKDD,
  volume =       "9",
  number =       "2",
  pages =        "11:1--11:??",
  month =        sep,
  year =         "2014",
  CODEN =        "????",
  DOI =          "http://dx.doi.org/10.1145/2629668",
  ISSN =         "1556-4681 (print), 1556-472X (electronic)",
  ISSN-L =       "1556-4681",
  bibdate =      "Tue Oct 7 18:49:26 MDT 2014",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tkdd/;
                 http://www.math.utah.edu/pub/tex/bib/tkdd.bib",
  abstract =     "Anomaly detection (a.k.a., outlier or burst detection)
                 is a well-motivated problem and a major data mining and
                 knowledge discovery task. In this article, we study the
                 problem of population anomaly detection, one of the key
                 issues related to event monitoring and population
                 management within a city. Through studying detected
                 population anomalies, we can trace and analyze these
                  anomalies, which could inform city traffic design and
                  support event impact analysis and prediction.
                 Although a significant and interesting issue, it is
                 very hard to detect population anomalies and retrieve
                 anomaly trajectories, especially given that it is
                 difficult to get actual and sufficient population data.
                 To address the difficulties of a lack of real
                 population data, we take advantage of mobile phone
                 networks, which offer enormous spatial and temporal
                 communication data on persons. More importantly, we
                 claim that we can utilize these mobile phone data to
                 infer and approximate population data. Thus, we can
                 study the population anomaly detection problem by
                  taking advantage of unique features hidden in mobile
                 phone data. In this article, we present a system to
                 conduct Population Anomaly Detection (PAD). First, we
                 propose an effective clustering method,
                 correlation-based clustering, to cluster the incomplete
                  location information from mobile phone data (i.e.,
                  mapping the mobile call volume distribution to a
                  population density distribution). Then, we design an
                  adaptive
                 parameter-free detection method, R-scan, to capture the
                 distributed dynamic anomalies. Finally, we devise an
                 efficient algorithm, BT-miner, to retrieve anomaly
                 trajectories. The experimental results from real-life
                 mobile phone data confirm the effectiveness and
                  efficiency of the proposed algorithms. The proposed
                  methods have also been realized as a pilot system in a
                 city in China.",
  acknowledgement = ack-nhfb,
  articleno =    "11",
  fjournal =     "ACM Transactions on Knowledge Discovery from Data
                 (TKDD)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1054",
}

@Article{Gundecha:2014:UVR,
  author =       "Pritam Gundecha and Geoffrey Barbier and Jiliang Tang
                 and Huan Liu",
  title =        "User Vulnerability and Its Reduction on a Social
                 Networking Site",
  journal =      j-TKDD,
  volume =       "9",
  number =       "2",
  pages =        "12:1--12:??",
  month =        sep,
  year =         "2014",
  CODEN =        "????",
  DOI =          "http://dx.doi.org/10.1145/2630421",
  ISSN =         "1556-4681 (print), 1556-472X (electronic)",
  ISSN-L =       "1556-4681",
  bibdate =      "Tue Oct 7 18:49:26 MDT 2014",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tkdd/;
                 http://www.math.utah.edu/pub/tex/bib/tkdd.bib",
  abstract =     "Privacy and security are major concerns for many users
                 of social media. When users share information (e.g.,
                 data and photos) with friends, they can make their
                 friends vulnerable to security and privacy breaches
                 with dire consequences. With the continuous expansion
                 of a user's social network, privacy settings alone are
                 often inadequate to protect a user's profile. In this
                 research, we aim to address some critical issues
                 related to privacy protection: (1) How can we measure
                 and assess individual users' vulnerability? (2) With
                 the diversity of one's social network friends, how can
                 one figure out an effective approach to maintaining
                 balance between vulnerability and social utility? In
                 this work, first we present a novel way to define
                 vulnerable friends from an individual user's
                 perspective. User vulnerability is dependent on whether
                 or not the user's friends' privacy settings protect the
                 friend and the individual's network of friends (which
                 includes the user). We show that it is feasible to
                 measure and assess user vulnerability and reduce one's
                 vulnerability without changing the structure of a
                 social networking site. The approach is to unfriend
                 one's most vulnerable friends. However, when such a
                 vulnerable friend is also socially important,
                 unfriending him or her would significantly reduce one's
                 own social status. We formulate this novel problem as
                 vulnerability minimization with social utility
                 constraints. We formally define the optimization
                 problem and provide an approximation algorithm with a
                 proven bound. Finally, we conduct a large-scale
                 evaluation of a new framework using a Facebook dataset.
                  Through experiments, we observe how much an individual
                  user's vulnerability can be decreased by unfriending a
                  vulnerable friend. We compare the performance
                 of different unfriending strategies and discuss the
                 security risk of new friend requests. Additionally, by
                 employing different forms of social utility, we confirm
                 that the balance between user vulnerability and social
                 utility can be practically achieved.",
  acknowledgement = ack-nhfb,
  articleno =    "12",
  fjournal =     "ACM Transactions on Knowledge Discovery from Data
                 (TKDD)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1054",
}
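
%% A toy greedy heuristic for the trade-off the abstract formalizes (pure
%% Python; the article's algorithm comes with a proven approximation
%% bound, this sketch does not):
%%
%%   def greedy_unfriend(vuln, utility, min_utility):
%%       # vuln/utility: dicts friend -> score.  Drop the friend with the
%%       # highest vulnerability per unit of social utility, as long as
%%       # total utility stays above the constraint.
%%       total = sum(utility.values())
%%       dropped = []
%%       ratio = lambda f: vuln[f] / max(utility[f], 1e-9)
%%       for f in sorted(vuln, key=ratio, reverse=True):
%%           if total - utility[f] >= min_utility:
%%               dropped.append(f)
%%               total -= utility[f]
%%       return dropped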

@Article{Duan:2014:SRC,
  author =       "Lian Duan and W. Nick Street and Yanchi Liu and
                 Songhua Xu and Brook Wu",
  title =        "Selecting the Right Correlation Measure for Binary
                 Data",
  journal =      j-TKDD,
  volume =       "9",
  number =       "2",
  pages =        "13:1--13:??",
  month =        sep,
  year =         "2014",
  CODEN =        "????",
  DOI =          "http://dx.doi.org/10.1145/2637484",
  ISSN =         "1556-4681 (print), 1556-472X (electronic)",
  ISSN-L =       "1556-4681",
  bibdate =      "Tue Oct 7 18:49:26 MDT 2014",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tkdd/;
                 http://www.math.utah.edu/pub/tex/bib/tkdd.bib",
  abstract =     "Finding the most interesting correlations among items
                 is essential for problems in many commercial, medical,
                 and scientific domains. Although there are numerous
                 measures available for evaluating correlations,
                 different correlation measures provide drastically
                 different results. Piatetsky-Shapiro provided three
                 mandatory properties for any reasonable correlation
                 measure, and Tan et al. proposed several properties to
                 categorize correlation measures; however, it is still
                 hard for users to choose the desirable correlation
                  measures according to their needs. To address this
                  issue, we explore the effectiveness problem in
                 three ways. First, we propose two desirable properties
                 and two optional properties for correlation measure
                 selection and study the property satisfaction for
                 different correlation measures. Second, we study
                 different techniques to adjust correlation measures and
                 propose two new correlation measures: the Simplified $
                 \chi^2 $ with Continuity Correction and the Simplified
                 $ \chi^2 $ with Support. Third, we analyze the upper
                 and lower bounds of different measures and categorize
                 them by the bound differences. Combining these three
                 directions, we provide guidelines for users to choose
                 the proper measure according to their needs.",
  acknowledgement = ack-nhfb,
  articleno =    "13",
  fjournal =     "ACM Transactions on Knowledge Discovery from Data
                 (TKDD)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1054",
}
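
%% For concreteness, the classical measure that the article's "Simplified
%% chi^2 with Continuity Correction" departs from: Pearson's chi-square on
%% the 2x2 contingency table of two items, with Yates' correction (pure
%% Python sketch; the article's simplified variants differ from this):
%%
%%   def chi2_yates(n11, n1x, nx1, n):
%%       # n11: co-occurrence count; n1x, nx1: the two items' marginal
%%       # counts (assumed strictly between 0 and n); n: transactions.
%%       # Expected counts come from independence; |o - e| is shrunk by
%%       # 0.5 (the continuity correction).
%%       e11 = n1x * nx1 / n
%%       cells = [(n11, e11),
%%                (n1x - n11, n1x - e11),
%%                (nx1 - n11, nx1 - e11),
%%                (n - n1x - nx1 + n11, n - n1x - nx1 + e11)]
%%       return sum((abs(o - e) - 0.5) ** 2 / e for o, e in cells)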

@Article{Huang:2014:PBA,
  author =       "Hao Huang and Hong Qin and Shinjae Yoo and Dantong
                 Yu",
  title =        "Physics-Based Anomaly Detection Defined on Manifold
                 Space",
  journal =      j-TKDD,
  volume =       "9",
  number =       "2",
  pages =        "14:1--14:??",
  month =        sep,
  year =         "2014",
  CODEN =        "????",
  DOI =          "http://dx.doi.org/10.1145/2641574",
  ISSN =         "1556-4681 (print), 1556-472X (electronic)",
  ISSN-L =       "1556-4681",
  bibdate =      "Tue Oct 7 18:49:26 MDT 2014",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tkdd/;
                 http://www.math.utah.edu/pub/tex/bib/tkdd.bib",
  abstract =     "Current popular anomaly detection algorithms are
                 capable of detecting global anomalies but often fail to
                 distinguish local anomalies from normal instances.
                 Inspired by contemporary physics theory (i.e., heat
                 diffusion and quantum mechanics), we propose two
                 unsupervised anomaly detection algorithms. Building on
                 the embedding manifold derived from heat diffusion, we
                 devise Local Anomaly Descriptor (LAD), which faithfully
                 reveals the intrinsic neighborhood density. It uses a
                 scale-dependent umbrella operator to bridge global and
                 local properties, which makes LAD more informative
                  within an adaptive scope of neighborhood. To make the
                  local density measurement more stable under scaling
                  parameter tuning, we formulate the Fermi Density
                  Descriptor
                 (FDD), which measures the probability of a fermion
                 particle being at a specific location. By choosing the
                 stable energy distribution function, FDD steadily
                 distinguishes anomalies from normal instances with any
                 scaling parameter setting. To further enhance the
                 efficacy of our proposed algorithms, we explore the
                 utility of anisotropic Gaussian kernel (AGK), which
                 offers better manifold-aware affinity information. We
                 also quantify and examine the effect of different
                 Laplacian normalizations for anomaly detection.
                 Comprehensive experiments on both synthetic and
                 benchmark datasets verify that our proposed algorithms
                 outperform the existing anomaly detection algorithms.",
  acknowledgement = ack-nhfb,
  articleno =    "14",
  fjournal =     "ACM Transactions on Knowledge Discovery from Data
                 (TKDD)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1054",
}
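
%% A bare-bones illustration of the diffusion-geometry starting point
%% (NumPy sketch; LAD and FDD are far more refined than this): score each
%% point by the inverse of its total heat-kernel affinity, so points in
%% sparse regions of the manifold score high.
%%
%%   import numpy as np
%%
%%   def heat_kernel_scores(X, sigma=1.0):
%%       # X: (n, d) data matrix; sigma: kernel scale parameter.
%%       sq = ((X[:, None, :] - X[None, :, :]) ** 2).sum(axis=-1)
%%       W = np.exp(-sq / (2.0 * sigma ** 2))   # heat-kernel affinities
%%       np.fill_diagonal(W, 0.0)
%%       return 1.0 / (W.sum(axis=1) + 1e-12)   # high = likely anomaly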

%% TO DO: [10-Oct-2014] check for more papers in incomplete v8n4, v9n1, and v9n2