%%% -*-BibTeX-*-
%%% ====================================================================
%%% BibTeX-file{
%%% author = "Nelson H. F. Beebe",
%%% version = "1.21",
%%% date = "28 July 2015",
%%% time = "17:20:30 MDT",
%%% filename = "tkdd.bib",
%%% address = "University of Utah
%%% Department of Mathematics, 110 LCB
%%% 155 S 1400 E RM 233
%%% Salt Lake City, UT 84112-0090
%%% USA",
%%% telephone = "+1 801 581 5254",
%%% FAX = "+1 801 581 4148",
%%% URL = "http://www.math.utah.edu/~beebe",
%%% checksum = "55641 9977 55638 530711",
%%% email = "beebe at math.utah.edu, beebe at acm.org,
%%% beebe at computer.org (Internet)",
%%% codetable = "ISO/ASCII",
%%% keywords = "ACM Transactions on Knowledge Discovery from
%%% Data (TKDD); bibliography; TKDD",
%%% license = "public domain",
%%% supported = "yes",
%%% docstring = "This is a COMPLETE BibTeX bibliography for
%%% ACM Transactions on Knowledge Discovery from
%%% Data (TKDD) (CODEN ????, ISSN 1556-4681),
%%% covering all journal issues from 2007 --
%%% date.
%%%
%%% At version 1.21, the COMPLETE journal
%%% coverage looked like this:
%%%
%%% 2007 ( 14) 2010 ( 26) 2013 ( 20)
%%% 2008 ( 18) 2011 ( 11) 2014 ( 37)
%%% 2009 ( 25) 2012 ( 26) 2015 ( 31)
%%%
%%% Article: 208
%%%
%%% Total entries: 208
%%%
%%% The journal Web page can be found at:
%%%
%%% http://www.acm.org/pubs/tkdd.html
%%%
%%% The journal table of contents page is at:
%%%
%%% http://www.acm.org/tkdd/
%%% http://portal.acm.org/browse_dl.cfm?idx=J1054
%%%
%%% Qualified subscribers can retrieve the full
%%% text of recent articles in PDF form.
%%%
%%% The initial draft was extracted from the ACM
%%% Web pages.
%%%
%%% ACM copyrights explicitly permit abstracting
%%% with credit, so article abstracts, keywords,
%%% and subject classifications have been
%%% included in this bibliography wherever
%%% available. Article reviews have been
%%% omitted, until their copyright status has
%%% been clarified.
%%%
%%% bibsource keys in the bibliography entries
%%% below indicate the entry originally came
%%% from the computer science bibliography
%%% archive, even though it has likely since
%%% been corrected and updated.
%%%
%%% URL keys in the bibliography point to
%%% World Wide Web locations of additional
%%% information about the entry.
%%%
%%% BibTeX citation tags are uniformly chosen
%%% as name:year:abbrev, where name is the
%%% family name of the first author or editor,
%%% year is a 4-digit number, and abbrev is a
%%% 3-letter condensation of important title
%%% words. Citation tags were automatically
%%% generated by software developed for the
%%% BibNet Project.
%%%
%%% In this bibliography, entries are sorted in
%%% publication order, using ``bibsort -byvolume.''
%%%
%%% The checksum field above contains a CRC-16
%%% checksum as the first value, followed by the
%%% equivalent of the standard UNIX wc (word
%%% count) utility output of lines, words, and
%%% characters. This is produced by Robert
%%% Solovay's checksum utility."
%%% }
%%% ====================================================================
%%% Preamble: TeX code that BibTeX places ahead of the generated
%%% bibliography.  The two concatenated strings read in bibnames.sty
%%% and define \TM as a superscript small-caps ``TM'' mark.
@Preamble{"\input bibnames.sty" #
"\def \TM {${}^{\sc TM}$}"
}
%%% ====================================================================
%%% Acknowledgement abbreviations:
%%% ack-nhfb: postal address, telephone/FAX numbers, e-mail addresses
%%% and home page of the bibliography maintainer.  The entries below
%%% refer to it through their acknowledgement field.
@String{ack-nhfb = "Nelson H. F. Beebe,
University of Utah,
Department of Mathematics, 110 LCB,
155 S 1400 E RM 233,
Salt Lake City, UT 84112-0090, USA,
Tel: +1 801 581 5254,
FAX: +1 801 581 4148,
e-mail: \path|beebe@math.utah.edu|,
\path|beebe@acm.org|,
\path|beebe@computer.org| (Internet),
URL: \path|http://www.math.utah.edu/~beebe/|"}
%%% ====================================================================
%%% Journal abbreviations:
%%% j-TKDD: full journal title, used as the journal field of every article.
@String{j-TKDD = {ACM Transactions on Knowledge Discovery from Data (TKDD)}}
%%% ====================================================================
%%% Bibliography entries:
@Article{Han:2007:I,
  author          = {Jiawei Han},
  title           = {Introduction},
  journal         = j-TKDD,
  year            = {2007},
  month           = mar,
  volume          = {1},
  number          = {1},
  pages           = {1:1--1:??},
  articleno       = {1},
  DOI             = {http://doi.acm.org/10.1145/1217299.1217300},
  ISSN            = {1556-4681 (print), 1556-472X (electronic)},
  CODEN           = {????},
  journal-URL     = {http://portal.acm.org/browse_dl.cfm?idx=J1054},
  bibdate         = {Fri Apr 24 17:58:36 MDT 2009},
  bibsource       = {http://www.acm.org/pubs/contents/journals/tkdd/;
                     http://www.math.utah.edu/pub/tex/bib/tkdd.bib},
  acknowledgement = ack-nhfb,
}
@Article{Leskovec:2007:GED,
  author          = {Jure Leskovec and Jon Kleinberg and Christos Faloutsos},
  title           = {Graph evolution: {Densification} and shrinking
                     diameters},
  journal         = j-TKDD,
  year            = {2007},
  month           = mar,
  volume          = {1},
  number          = {1},
  pages           = {2:1--2:??},
  articleno       = {2},
  DOI             = {http://doi.acm.org/10.1145/1217299.1217301},
  ISSN            = {1556-4681 (print), 1556-472X (electronic)},
  CODEN           = {????},
  journal-URL     = {http://portal.acm.org/browse_dl.cfm?idx=J1054},
  keywords        = {Densification power laws; graph generators; graph
                     mining; heavy-tailed distributions; small-world
                     phenomena},
  abstract        = {How do real graphs evolve over time? What are normal
                     growth patterns in social, technological, and
                     information networks? Many studies have discovered
                     patterns in {\em static graphs}, identifying properties
                     in a single snapshot of a large network or in a very
                     small number of snapshots; these include heavy tails
                     for in- and out-degree distributions, communities,
                     small-world phenomena, and others. However, given the
                     lack of information about network evolution over long
                     periods, it has been hard to convert these findings
                     into statements about trends over time.\par
                     Here we study a wide range of real graphs, and we
                     observe some surprising phenomena. First, most of these
                     graphs densify over time with the number of edges
                     growing superlinearly in the number of nodes. Second,
                     the average distance between nodes often shrinks over
                     time in contrast to the conventional wisdom that such
                     distance parameters should increase slowly as a
                     function of the number of nodes (like $O(\log n)$ or
                     $O(\log(\log n))$).\par
                     Existing graph generation models do not exhibit these
                     types of behavior even at a qualitative level. We
                     provide a new graph generator, based on a forest fire
                     spreading process that has a simple, intuitive
                     justification, requires very few parameters (like the
                     flammability of nodes), and produces graphs exhibiting
                     the full range of properties observed both in prior
                     work and in the present study.\par
                     We also notice that the forest fire model exhibits a
                     sharp transition between sparse graphs and graphs that
                     are densifying. Graphs with decreasing distance between
                     the nodes are generated around this transition
                     point.\par
                     Last, we analyze the connection between the temporal
                     evolution of the degree distribution and densification
                     of a graph. We find that the two are fundamentally
                     related. We also observe that real networks exhibit
                     this type of relation between densification and the
                     degree distribution.},
  bibdate         = {Fri Apr 24 17:58:36 MDT 2009},
  bibsource       = {http://www.acm.org/pubs/contents/journals/tkdd/;
                     http://www.math.utah.edu/pub/tex/bib/tkdd.bib},
  acknowledgement = ack-nhfb,
}
%%% Note: the privacy criterion is written $\ell$-diversity in the
%%% abstract and keywords; the title field keeps the $L$ form.
@Article{Machanavajjhala:2007:DPB,
  author =       "Ashwin Machanavajjhala and Daniel Kifer and Johannes
                 Gehrke and Muthuramakrishnan Venkitasubramaniam",
  title =        "{{$L$}}-diversity: {Privacy} beyond $k$-anonymity",
  journal =      j-TKDD,
  volume =       "1",
  number =       "1",
  pages =        "3:1--3:??",
  month =        mar,
  year =         "2007",
  CODEN =        "????",
  DOI =          "http://doi.acm.org/10.1145/1217299.1217302",
  ISSN =         "1556-4681 (print), 1556-472X (electronic)",
  bibdate =      "Fri Apr 24 17:58:36 MDT 2009",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tkdd/;
                 http://www.math.utah.edu/pub/tex/bib/tkdd.bib",
  abstract =     "Publishing data about individuals without revealing
                 sensitive information about them is an important
                 problem. In recent years, a new definition of privacy
                 called $k$-anonymity has gained popularity. In a
                 $k$-anonymized dataset, each record is
                 indistinguishable from at least $k - 1$ other records
                 with respect to certain identifying attributes.\par
                 In this article, we show using two simple attacks that
                 a $k$-anonymized dataset has some subtle but severe
                 privacy problems. First, an attacker can discover the
                 values of sensitive attributes when there is little
                 diversity in those sensitive attributes. This is a
                 known problem. Second, attackers often have background
                 knowledge, and we show that $k$-anonymity does not
                 guarantee privacy against attackers using background
                 knowledge. We give a detailed analysis of these two
                 attacks, and we propose a novel and powerful privacy
                 criterion called $\ell$-diversity that can defend
                 against such attacks. In addition to building a formal
                 foundation for $\ell$-diversity, we show in an
                 experimental evaluation that $\ell$-diversity is
                 practical and can be implemented efficiently.",
  acknowledgement = ack-nhfb,
  articleno =    "3",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1054",
  keywords =     "$\ell$-diversity; Data privacy; $k$-anonymity;
                 privacy-preserving data publishing",
}
@Article{Gionis:2007:CA,
  author          = {Aristides Gionis and Heikki Mannila and Panayiotis
                     Tsaparas},
  title           = {Clustering aggregation},
  journal         = j-TKDD,
  year            = {2007},
  month           = mar,
  volume          = {1},
  number          = {1},
  pages           = {4:1--4:??},
  articleno       = {4},
  DOI             = {http://doi.acm.org/10.1145/1217299.1217303},
  ISSN            = {1556-4681 (print), 1556-472X (electronic)},
  CODEN           = {????},
  journal-URL     = {http://portal.acm.org/browse_dl.cfm?idx=J1054},
  keywords        = {clustering aggregation; clustering categorical data;
                     correlation clustering; Data clustering},
  abstract        = {We consider the following problem: given a set of
                     clusterings, find a single clustering that agrees as
                     much as possible with the input clusterings. This
                     problem, {\em clustering aggregation}, appears
                     naturally in various contexts. For example, clustering
                     categorical data is an instance of the clustering
                     aggregation problem; each categorical attribute can be
                     viewed as a clustering of the input rows where rows are
                     grouped together if they take the same value on that
                     attribute. Clustering aggregation can also be used as a
                     metaclustering method to improve the robustness of
                     clustering by combining the output of multiple
                     algorithms. Furthermore, the problem formulation does
                     not require a priori information about the number of
                     clusters; it is naturally determined by the
                     optimization function.\par
                     In this article, we give a formal statement of the
                     clustering aggregation problem, and we propose a number
                     of algorithms. Our algorithms make use of the
                     connection between clustering aggregation and the
                     problem of {\em correlation clustering}. Although the
                     problems we consider are NP-hard, for several of our
                     methods, we provide theoretical guarantees on the
                     quality of the solutions. Our work provides the best
                     deterministic approximation algorithm for the variation
                     of the correlation clustering problem we consider. We
                     also show how sampling can be used to scale the
                     algorithms for large datasets. We give an extensive
                     empirical evaluation demonstrating the usefulness of
                     the problem and of the solutions.},
  bibdate         = {Fri Apr 24 17:58:36 MDT 2009},
  bibsource       = {http://www.acm.org/pubs/contents/journals/tkdd/;
                     http://www.math.utah.edu/pub/tex/bib/tkdd.bib},
  acknowledgement = ack-nhfb,
}
@Article{Bhattacharya:2007:CER,
  author          = {Indrajit Bhattacharya and Lise Getoor},
  title           = {Collective entity resolution in relational data},
  journal         = j-TKDD,
  year            = {2007},
  month           = mar,
  volume          = {1},
  number          = {1},
  pages           = {5:1--5:??},
  articleno       = {5},
  DOI             = {http://doi.acm.org/10.1145/1217299.1217304},
  ISSN            = {1556-4681 (print), 1556-472X (electronic)},
  CODEN           = {????},
  journal-URL     = {http://portal.acm.org/browse_dl.cfm?idx=J1054},
  keywords        = {data cleaning; Entity resolution; graph clustering;
                     record linkage},
  abstract        = {Many databases contain uncertain and imprecise
                     references to real-world entities. The absence of
                     identifiers for the underlying entities often results
                     in a database which contains multiple references to the
                     same entity. This can lead not only to data redundancy,
                     but also inaccuracies in query processing and knowledge
                     extraction. These problems can be alleviated through
                     the use of {\em entity resolution}. Entity resolution
                     involves discovering the underlying entities and
                     mapping each database reference to these entities.
                     Traditionally, entities are resolved using pairwise
                     similarity over the attributes of references. However,
                     there is often additional relational information in the
                     data. Specifically, references to different entities
                     may cooccur. In these cases, collective entity
                     resolution, in which entities for cooccurring
                     references are determined jointly rather than
                     independently, can improve entity resolution accuracy.
                     We propose a novel relational clustering algorithm that
                     uses both attribute and relational information for
                     determining the underlying domain entities, and we give
                     an efficient implementation. We investigate the impact
                     that different relational similarity measures have on
                     entity resolution quality. We evaluate our collective
                     entity resolution algorithm on multiple real-world
                     databases. We show that it improves entity resolution
                     performance over both attribute-based baselines and
                     over algorithms that consider relational information
                     but do not resolve entities collectively. In addition,
                     we perform detailed experiments on synthetically
                     generated data to identify data characteristics that
                     favor collective relational resolution over purely
                     attribute-based algorithms.},
  bibdate         = {Fri Apr 24 17:58:36 MDT 2009},
  bibsource       = {http://www.acm.org/pubs/contents/journals/tkdd/;
                     http://www.math.utah.edu/pub/tex/bib/tkdd.bib},
  acknowledgement = ack-nhfb,
}
@Article{Loh:2007:EEL,
  author          = {Wei-Yin Loh and Chien-Wei Chen and Wei Zheng},
  title           = {Extrapolation errors in linear model trees},
  journal         = j-TKDD,
  year            = {2007},
  month           = aug,
  volume          = {1},
  number          = {2},
  pages           = {6:1--6:??},
  articleno       = {6},
  DOI             = {http://doi.acm.org/10.1145/1267066.1267067},
  ISSN            = {1556-4681 (print), 1556-472X (electronic)},
  CODEN           = {????},
  journal-URL     = {http://portal.acm.org/browse_dl.cfm?idx=J1054},
  keywords        = {Decision tree; prediction; regression; statistics},
  abstract        = {Prediction errors from a linear model tend to be
                     larger when extrapolation is involved, particularly
                     when the model is wrong. This article considers the
                     problem of extrapolation and interpolation errors when
                     a linear model tree is used for prediction. It proposes
                     several ways to curtail the size of the errors, and
                     uses a large collection of real datasets to demonstrate
                     that the solutions are effective in reducing the
                     average mean squared prediction error. The article also
                     provides a proof that, if a linear model is correct,
                     the proposed solutions have no undesirable effects as
                     the training sample size tends to infinity.},
  bibdate         = {Fri Apr 24 17:58:48 MDT 2009},
  bibsource       = {http://www.acm.org/pubs/contents/journals/tkdd/;
                     http://www.math.utah.edu/pub/tex/bib/tkdd.bib},
  acknowledgement = ack-nhfb,
}
@Article{Zhang:2007:MPP,
  author          = {Minghua Zhang and Ben Kao and David W. Cheung and
                     Kevin Y. Yip},
  title           = {Mining periodic patterns with gap requirement from
                     sequences},
  journal         = j-TKDD,
  year            = {2007},
  month           = aug,
  volume          = {1},
  number          = {2},
  pages           = {7:1--7:??},
  articleno       = {7},
  DOI             = {http://doi.acm.org/10.1145/1267066.1267068},
  ISSN            = {1556-4681 (print), 1556-472X (electronic)},
  CODEN           = {????},
  journal-URL     = {http://portal.acm.org/browse_dl.cfm?idx=J1054},
  keywords        = {gap requirement; periodic pattern; Sequence mining},
  abstract        = {We study a problem of mining frequently occurring
                     periodic patterns with a gap requirement from
                     sequences. Given a character sequence $S$ of length $L$
                     and a pattern $P$ of length $l$, we consider $P$ a
                     frequently occurring pattern in $S$ if the probability
                     of {\em observing\/} $P$ given a randomly picked
                     length-$l$ subsequence of $S$ exceeds a certain
                     threshold. In many applications, particularly those
                     related to bioinformatics, interesting patterns are
                     {\em periodic\/} with a {\em gap requirement}. That is
                     to say, the characters in $P$ should match subsequences
                     of $S$ in such a way that the matching characters in
                     $S$ are separated by gaps of more or less the same
                     size. We show the complexity of the mining problem and
                     discuss why traditional mining algorithms are
                     computationally infeasible. We propose practical
                     algorithms for solving the problem and study their
                     characteristics. We also present a case study in which
                     we apply our algorithms on some DNA sequences. We
                     discuss some interesting patterns obtained from the
                     case study.},
  bibdate         = {Fri Apr 24 17:58:48 MDT 2009},
  bibsource       = {http://www.acm.org/pubs/contents/journals/tkdd/;
                     http://www.math.utah.edu/pub/tex/bib/tkdd.bib},
  acknowledgement = ack-nhfb,
}
%%% Note: the inequality in the abstract is set entirely in math mode,
%%% because a bare `>' in text mode prints as an inverted question mark
%%% under the default OT1 font encoding.
@Article{Huang:2007:TTE,
  author =       "Jen-Wei Huang and Bi-Ru Dai and Ming-Syan Chen",
  title =        "{Twain}: {Two-end} association miner with precise
                 frequent exhibition periods",
  journal =      j-TKDD,
  volume =       "1",
  number =       "2",
  pages =        "8:1--8:??",
  month =        aug,
  year =         "2007",
  CODEN =        "????",
  DOI =          "http://doi.acm.org/10.1145/1267066.1267069",
  ISSN =         "1556-4681 (print), 1556-472X (electronic)",
  bibdate =      "Fri Apr 24 17:58:48 MDT 2009",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tkdd/;
                 http://www.math.utah.edu/pub/tex/bib/tkdd.bib",
  abstract =     "We investigate the general model of mining
                 associations in a temporal database, where the
                 exhibition periods of items are allowed to be different
                 from one to another. The database is divided into
                 partitions according to the time granularity imposed.
                 Such temporal association rules allow us to observe
                 short-term but interesting patterns that are absent
                 when the whole range of the database is evaluated
                 altogether. Prior work may omit some temporal
                 association rules and thus have limited practicability.
                 To remedy this and to give more precise frequent
                 exhibition periods of frequent temporal itemsets, we
                 devise an efficient algorithm {\em Twain\/} (standing
                 for {\em TWo end AssocIation miNer\/}.) {\em Twain\/}
                 not only generates frequent patterns with more precise
                 frequent exhibition periods, but also discovers more
                 interesting frequent patterns. {\em Twain\/} employs
                 Start time and End time of each item to provide precise
                 frequent exhibition period while progressively handling
                 itemsets from one partition to another. Along with one
                 scan of the database, {\em Twain\/} can generate
                 frequent 2-itemsets directly according to the
                 cumulative filtering threshold. Then, {\em Twain\/}
                 adopts the scan reduction technique to generate all
                 frequent $k$-itemsets ($k > 2$) from the generated
                 frequent 2-itemsets. Theoretical properties of {\em
                 Twain\/} are derived as well in this article. The
                 experimental results show that {\em Twain\/}
                 outperforms the prior works in the quality of frequent
                 patterns, execution time, I/O cost, CPU overhead and
                 scalability.",
  acknowledgement = ack-nhfb,
  articleno =    "8",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1054",
  keywords =     "Association; temporal",
}
%%% Note: the citation tag keeps the spelling ``Bayardop'' so that
%%% existing \cite commands continue to resolve; the author field
%%% carries the corrected names (Bayardo, Gehrke).
@Article{Bayardop:2007:ISI,
  author =       "Roberto Bayardo and Kristin P. Bennett and Gautam Das
                 and Dimitrios Gunopulos and Johannes Gehrke",
  title =        "Introduction to special issue {ACM SIGKDD 2006}",
  journal =      j-TKDD,
  volume =       "1",
  number =       "3",
  pages =        "9:1--9:??",
  month =        dec,
  year =         "2007",
  CODEN =        "????",
  DOI =          "http://doi.acm.org/10.1145/1297332.1297333",
  ISSN =         "1556-4681 (print), 1556-472X (electronic)",
  bibdate =      "Fri Apr 24 17:58:56 MDT 2009",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tkdd/;
                 http://www.math.utah.edu/pub/tex/bib/tkdd.bib",
  acknowledgement = ack-nhfb,
  articleno =    "9",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1054",
}
@Article{Bohm:2007:RPF,
  author          = {Christian B{\"o}hm and Christos Faloutsos and Jia-Yu
                     Pan and Claudia Plant},
  title           = {{RIC}: {Parameter-free} noise-robust clustering},
  journal         = j-TKDD,
  year            = {2007},
  month           = dec,
  volume          = {1},
  number          = {3},
  pages           = {10:1--10:??},
  articleno       = {10},
  DOI             = {http://doi.acm.org/10.1145/1297332.1297334},
  ISSN            = {1556-4681 (print), 1556-472X (electronic)},
  CODEN           = {????},
  journal-URL     = {http://portal.acm.org/browse_dl.cfm?idx=J1054},
  keywords        = {Clustering; data summarization; noise robustness;
                     parameter-free data mining},
  abstract        = {How do we find a {\em natural\/} clustering of a
                     real-world point set which contains an unknown number
                     of clusters with different shapes, and which may be
                     contaminated by noise? As most clustering algorithms
                     were designed with certain assumptions (Gaussianity),
                     they often require the user to give input parameters,
                     and are sensitive to noise. In this article, we propose
                     a robust framework for determining a natural clustering
                     of a given dataset, based on the minimum description
                     length (MDL) principle. The proposed framework, {\em
                     robust information-theoretic clustering (RIC)}, is
                     orthogonal to any known clustering algorithm: Given a
                     preliminary clustering, RIC purifies these clusters
                     from noise, and adjusts the clusterings such that it
                     simultaneously determines the most natural amount and
                     shape (subspace) of the clusters. Our RIC method can be
                     combined with any clustering technique ranging from
                     K-means and K-medoids to advanced methods such as
                     spectral clustering. In fact, RIC is even able to
                     purify and improve an initial coarse clustering, even
                     if we start with very simple methods. In an extension,
                     we propose a fully automatic stand-alone clustering
                     method and efficiency improvements. RIC scales well
                     with the dataset size. Extensive experiments on
                     synthetic and real-world datasets validate the proposed
                     RIC framework.},
  bibdate         = {Fri Apr 24 17:58:56 MDT 2009},
  bibsource       = {http://www.acm.org/pubs/contents/journals/tkdd/;
                     http://www.math.utah.edu/pub/tex/bib/tkdd.bib},
  acknowledgement = ack-nhfb,
}
@Article{Mei:2007:SAF,
  author          = {Qiaozhu Mei and Dong Xin and Hong Cheng and Jiawei Han
                     and Chengxiang Zhai},
  title           = {Semantic annotation of frequent patterns},
  journal         = j-TKDD,
  year            = {2007},
  month           = dec,
  volume          = {1},
  number          = {3},
  pages           = {11:1--11:??},
  articleno       = {11},
  DOI             = {http://doi.acm.org/10.1145/1297332.1297335},
  ISSN            = {1556-4681 (print), 1556-472X (electronic)},
  CODEN           = {????},
  journal-URL     = {http://portal.acm.org/browse_dl.cfm?idx=J1054},
  keywords        = {Frequent pattern; pattern annotation; pattern context;
                     pattern semantic analysis},
  abstract        = {Using frequent patterns to analyze data has been one
                     of the fundamental approaches in many data mining
                     applications. Research in frequent pattern mining has
                     so far mostly focused on developing efficient
                     algorithms to discover various kinds of frequent
                     patterns, but little attention has been paid to the
                     important next step --- interpreting the discovered
                     frequent patterns. Although the compression and
                     summarization of frequent patterns has been studied in
                     some recent work, the proposed techniques there can
                     only annotate a frequent pattern with nonsemantical
                     information (e.g., support), which provides only
                     limited help for a user to understand the
                     patterns.\par
                     In this article, we study the novel problem of
                     generating semantic annotations for frequent patterns.
                     The goal is to discover the hidden meanings of a
                     frequent pattern by annotating it with in-depth,
                     concise, and structured information. We propose a
                     general approach to generate such an annotation for a
                     frequent pattern by constructing its context model,
                     selecting informative context indicators, and
                     extracting representative transactions and semantically
                     similar patterns. This general approach can well
                     incorporate the user's prior knowledge, and has
                     potentially many applications, such as generating a
                     dictionary-like description for a pattern, finding
                     synonym patterns, discovering semantic relations, and
                     summarizing semantic classes of a set of frequent
                     patterns. Experiments on different datasets show that
                     our approach is effective in generating semantic
                     pattern annotations.},
  bibdate         = {Fri Apr 24 17:58:56 MDT 2009},
  bibsource       = {http://www.acm.org/pubs/contents/journals/tkdd/;
                     http://www.math.utah.edu/pub/tex/bib/tkdd.bib},
  acknowledgement = ack-nhfb,
}
@Article{Koren:2007:MEP,
  author          = {Yehuda Koren and Stephen C. North and Chris Volinsky},
  title           = {Measuring and extracting proximity graphs in networks},
  journal         = j-TKDD,
  year            = {2007},
  month           = dec,
  volume          = {1},
  number          = {3},
  pages           = {12:1--12:??},
  articleno       = {12},
  DOI             = {http://doi.acm.org/10.1145/1297332.1297336},
  ISSN            = {1556-4681 (print), 1556-472X (electronic)},
  CODEN           = {????},
  journal-URL     = {http://portal.acm.org/browse_dl.cfm?idx=J1054},
  keywords        = {Connection subgraph; cycle-free escape probability;
                     escape probability; graph mining; proximity; proximity
                     subgraph; random walk},
  abstract        = {Measuring distance or some other form of proximity
                     between objects is a standard data mining tool.
                     Connection subgraphs were recently proposed as a way to
                     demonstrate proximity between nodes in networks. We
                     propose a new way of measuring and extracting proximity
                     in networks called ``cycle-free effective conductance''
                     (CFEC). Importantly, the measured proximity is
                     accompanied with a {\em proximity subgraph\/} which
                     allows assessing and understanding measured values. Our
                     proximity calculation can handle more than two
                     endpoints, directed edges, is statistically well
                     behaved, and produces an effectiveness score for the
                     computed subgraphs. We provide an efficient algorithm
                     to measure and extract proximity. Also, we report
                     experimental results and show examples for four large
                     network datasets: a telecommunications calling graph,
                     the IMDB actors graph, an academic coauthorship
                     network, and a movie recommendation system.},
  bibdate         = {Fri Apr 24 17:58:56 MDT 2009},
  bibsource       = {http://www.acm.org/pubs/contents/journals/tkdd/;
                     http://www.math.utah.edu/pub/tex/bib/tkdd.bib},
  acknowledgement = ack-nhfb,
}
@Article{Ihler:2007:LDE,
  author          = {Alexander Ihler and Jon Hutchins and Padhraic Smyth},
  title           = {Learning to detect events with {Markov}-modulated
                     {Poisson} processes},
  journal         = j-TKDD,
  year            = {2007},
  month           = dec,
  volume          = {1},
  number          = {3},
  pages           = {13:1--13:??},
  articleno       = {13},
  DOI             = {http://doi.acm.org/10.1145/1297332.1297337},
  ISSN            = {1556-4681 (print), 1556-472X (electronic)},
  CODEN           = {????},
  journal-URL     = {http://portal.acm.org/browse_dl.cfm?idx=J1054},
  keywords        = {Event detection; Markov modulated; Poisson},
  abstract        = {Time-series of count data occur in many different
                     contexts, including Internet navigation logs, freeway
                     traffic monitoring, and security logs associated with
                     buildings. In this article we describe a framework for
                     detecting anomalous events in such data using an
                     unsupervised learning approach. Normal periodic
                     behavior is modeled via a time-varying Poisson process
                     model, which in turn is modulated by a hidden Markov
                     process that accounts for bursty events. We outline a
                     Bayesian framework for learning the parameters of this
                     model from count time-series. Two large real-world
                     datasets of time-series counts are used as testbeds to
                     validate the approach, consisting of freeway traffic
                     data and logs of people entering and exiting a
                     building. We show that the proposed model is
                     significantly more accurate at detecting known events
                     than a more traditional threshold-based technique. We
                     also describe how the model can be used to investigate
                     different degrees of periodicity in the data, including
                     systematic day-of-week and time-of-day effects, and to
                     make inferences about different aspects of events such
                     as number of vehicles or people involved. The results
                     indicate that the Markov-modulated Poisson framework
                     provides a robust and accurate framework for adaptively
                     and autonomously learning how to separate unusual
                     bursty events from traces of normal human activity.},
  bibdate         = {Fri Apr 24 17:58:56 MDT 2009},
  bibsource       = {http://www.acm.org/pubs/contents/journals/tkdd/;
                     http://www.math.utah.edu/pub/tex/bib/tkdd.bib},
  acknowledgement = ack-nhfb,
}
@Article{Gionis:2007:ADM,
  author          = {Aristides Gionis and Heikki Mannila and Taneli
                     Mielik{\"a}inen and Panayiotis Tsaparas},
  title           = {Assessing data mining results via swap randomization},
  journal         = j-TKDD,
  year            = {2007},
  month           = dec,
  volume          = {1},
  number          = {3},
  pages           = {14:1--14:??},
  articleno       = {14},
  DOI             = {http://doi.acm.org/10.1145/1297332.1297338},
  ISSN            = {1556-4681 (print), 1556-472X (electronic)},
  CODEN           = {????},
  journal-URL     = {http://portal.acm.org/browse_dl.cfm?idx=J1054},
  keywords        = {0--1 data; randomization tests; Significance testing;
                     swaps},
  abstract        = {The problem of assessing the significance of data
                     mining results on high-dimensional 0--1 datasets has
                     been studied extensively in the literature. For
                     problems such as mining frequent sets and finding
                     correlations, significance testing can be done by
                     standard statistical tests such as chi-square, or other
                     methods. However, the results of such tests depend only
                     on the specific attributes and not on the dataset as a
                     whole. Moreover, the tests are difficult to apply to
                     sets of patterns or other complex results of data
                     mining algorithms. In this article, we consider a
                     simple randomization technique that deals with this
                     shortcoming. The approach consists of producing random
                     datasets that have the same row and column margins as
                     the given dataset, computing the results of interest on
                     the randomized instances and comparing them to the
                     results on the actual data. This randomization
                     technique can be used to assess the results of many
                     different types of data mining algorithms, such as
                     frequent sets, clustering, and spectral analysis. To
                     generate random datasets with given margins, we use
                     variations of a Markov chain approach which is based on
                     a simple swap operation. We give theoretical results on
                     the efficiency of different randomization methods, and
                     apply the swap randomization method to several
                     well-known datasets. Our results indicate that for some
                     datasets the structure discovered by the data mining
                     algorithms is expected, given the row and column
                     margins of the datasets, while for other datasets the
                     discovered structure conveys information that is not
                     captured by the margin counts.},
  bibdate         = {Fri Apr 24 17:58:56 MDT 2009},
  bibsource       = {http://www.acm.org/pubs/contents/journals/tkdd/;
                     http://www.math.utah.edu/pub/tex/bib/tkdd.bib},
  acknowledgement = ack-nhfb,
}
@Article{Tang:2008:TTA,
  author          = {Lei Tang and Huan Liu and Jianping Zhang and Nitin
                     Agarwal and John J. Salerno},
  title           = {Topic taxonomy adaptation for group profiling},
  journal         = j-TKDD,
  year            = {2008},
  month           = jan,
  volume          = {1},
  number          = {4},
  pages           = {1:1--1:??},
  articleno       = {1},
  DOI             = {http://doi.acm.org/10.1145/1324172.1324173},
  ISSN            = {1556-4681 (print), 1556-472X (electronic)},
  CODEN           = {????},
  journal-URL     = {http://portal.acm.org/browse_dl.cfm?idx=J1054},
  keywords        = {dynamic profiling; group interest; taxonomy
                     adjustment; text hierarchical classification; Topic
                     taxonomy},
  abstract        = {A topic taxonomy is an effective representation that
                     describes salient features of virtual groups or online
                     communities. A topic taxonomy consists of topic nodes.
                     Each internal node is defined by its vertical path
                     (i.e., ancestor and child nodes) and its horizontal
                     list of attributes (or terms). In a text-dominant
                     environment, a topic taxonomy can be used to flexibly
                     describe a group's interests with varying granularity.
                     However, the stagnant nature of a taxonomy may fail to
                     timely capture the dynamic change of a group's
                     interest. This article addresses the problem of how to
                     adapt a topic taxonomy to the accumulated data that
                     reflects the change of a group's interest to achieve
                     dynamic group profiling. We first discuss the issues
                     related to topic taxonomy. We next formulate taxonomy
                     adaptation as an optimization problem to find the
                     taxonomy that best fits the data. We then present a
                     viable algorithm that can efficiently accomplish
                     taxonomy adaptation. We conduct extensive experiments
                     to evaluate our approach's efficacy for group
                     profiling, compare the approach with some alternatives,
                     and study its performance for dynamic group profiling.
                     While pointing out various applications of taxonomy
                     adaption, we suggest some future work that can take
                     advantage of burgeoning Web 2.0 services for online
                     targeted marketing, counterterrorism in connecting
                     dots, and community tracking.},
  bibdate         = {Fri Apr 24 17:59:07 MDT 2009},
  bibsource       = {http://www.acm.org/pubs/contents/journals/tkdd/;
                     http://www.math.utah.edu/pub/tex/bib/tkdd.bib},
  acknowledgement = ack-nhfb,
}
@Article{Cormode:2008:FHH,
  author =       "Graham Cormode and Flip Korn and S. Muthukrishnan and
                 Divesh Srivastava",
  title =        "Finding hierarchical heavy hitters in streaming data",
  journal =      j-TKDD,
  volume =       "1",
  number =       "4",
  pages =        "2:1--2:??",
  month =        jan,
  year =         "2008",
  CODEN =        "????",
  DOI =          "http://doi.acm.org/10.1145/1324172.1324174",
  ISSN =         "1556-4681 (print), 1556-472X (electronic)",
  bibdate =      "Fri Apr 24 17:59:07 MDT 2009",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tkdd/;
                 http://www.math.utah.edu/pub/tex/bib/tkdd.bib",
  abstract =     "Data items that arrive online as streams typically
                 have attributes which take values from one or more
                 hierarchies (time and geographic location, source and
                 destination IP addresses, etc.). Providing an aggregate
                 view of such data is important for summarization,
                 visualization, and analysis. We develop an aggregate
                 view based on certain organized sets of large-valued
                 regions (``heavy hitters'') corresponding to
                 hierarchically discounted frequency counts. We formally
                 define the notion of {\em hierarchical heavy hitters\/}
                 (HHHs). We first consider computing (approximate) HHHs
                 over a data stream drawn from a single hierarchical
                 attribute. We formalize the problem and give
                 deterministic algorithms to find them in a single pass
                 over the input.\par
                 In order to analyze a wider range of realistic data
                 streams (e.g., from IP traffic-monitoring
                 applications), we generalize this problem to multiple
                 dimensions. Here, the semantics of HHHs are more
                 complex, since a ``child'' node can have multiple
                 ``parent'' nodes. We present online algorithms that
                 find approximate HHHs in one pass, with provable
                 accuracy guarantees. The product of hierarchical
                 dimensions forms a mathematical lattice structure. Our
                 algorithms exploit this structure, and so are able to
                 track approximate HHHs using only a small, fixed number
                 of statistics per stored item, regardless of the number
                 of dimensions.\par
                 We show experimentally, using real data, that our
                 proposed algorithms yields outputs which are very
                 similar (virtually identical, in many cases) to offline
                 computations of the exact solutions, whereas
                 straightforward heavy-hitters-based approaches give
                 significantly inferior answer quality. Furthermore, the
                 proposed algorithms result in an order of magnitude
                 savings in data structure size while performing
                 competitively.",
  acknowledgement = ack-nhfb,
  articleno =    "2",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1054",
  keywords =     "approximation algorithms; Data mining; network data
                 analysis",
}
@Article{Somaiya:2008:LCU,
  author =       "Manas Somaiya and Christopher Jermaine and Sanjay
                 Ranka",
  title =        "Learning correlations using the mixture-of-subsets
                 model",
  journal =      j-TKDD,
  volume =       "1",
  number =       "4",
  pages =        "3:1--3:??",
  month =        jan,
  year =         "2008",
  CODEN =        "????",
  DOI =          "http://doi.acm.org/10.1145/1324172.1324175",
  ISSN =         "1556-4681 (print), 1556-472X (electronic)",
  bibdate =      "Fri Apr 24 17:59:07 MDT 2009",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tkdd/;
                 http://www.math.utah.edu/pub/tex/bib/tkdd.bib",
  abstract =     "Using a mixture of random variables to model data is a
                 tried-and-tested method common in data mining, machine
                 learning, and statistics. By using mixture modeling it
                 is often possible to accurately model even complex,
                 multimodal data via very simple components. However,
                 the classical mixture model assumes that a data point
                 is generated by a single component in the model. A lot
                 of datasets can be modeled closer to the underlying
                 reality if we drop this restriction. We propose a
                 probabilistic framework, the {\em mixture-of-subsets
                 (MOS) model}, by making two fundamental changes to the
                 classical mixture model. First, we allow a data point
                 to be generated by a set of components, rather than
                 just a single component. Next, we limit the number of
                 data attributes that each component can influence. We
                 also propose an EM framework to learn the MOS model
                 from a dataset, and experimentally evaluate it on real,
                 high-dimensional datasets. Our results show that the
                 MOS model learned from the data represents the
                 underlying nature of the data accurately.",
  acknowledgement = ack-nhfb,
  articleno =    "3",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1054",
  keywords =     "EM algorithm; high-dimensional data; Mixture
                 modeling",
}
@Article{Halkidi:2008:CFB,
  author =       "M. Halkidi and D. Gunopulos and M. Vazirgiannis and N.
                 Kumar and C. Domeniconi",
  title =        "A clustering framework based on subjective and
                 objective validity criteria",
  journal =      j-TKDD,
  volume =       "1",
  number =       "4",
  pages =        "4:1--4:??",
  month =        jan,
  year =         "2008",
  CODEN =        "????",
  DOI =          "http://doi.acm.org/10.1145/1324172.1324176",
  ISSN =         "1556-4681 (print), 1556-472X (electronic)",
  bibdate =      "Fri Apr 24 17:59:07 MDT 2009",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tkdd/;
                 http://www.math.utah.edu/pub/tex/bib/tkdd.bib",
  abstract =     "Clustering, as an unsupervised learning process is a
                 challenging problem, especially in cases of
                 high-dimensional datasets. Clustering result quality
                 can benefit from user constraints and objective
                 validity assessment. In this article, we propose a
                 semisupervised framework for learning the weighted
                 Euclidean subspace, where the best clustering can be
                 achieved. Our approach capitalizes on: (i) user
                 constraints; and (ii) the quality of intermediate
                 clustering results in terms of their structural
                 properties. The proposed framework uses the clustering
                 algorithm and the validity measure as its parameters.
                 We develop and discuss algorithms for learning and
                 tuning the weights of contributing dimensions and
                 defining the ``best'' clustering obtained by satisfying
                 user constraints. Experimental results on benchmark
                 datasets demonstrate the superiority of the proposed
                 approach in terms of improved clustering accuracy.",
  acknowledgement = ack-nhfb,
  articleno =    "4",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1054",
  keywords =     "cluster validity; data mining; Semisupervised
                 learning; similarity measure learning; space learning",
}
@Article{Zaki:2008:ISI,
  author =       "Mohammed J. Zaki and George Karypis and Jiong Yang and
                 Wei Wang",
  title =        "Introduction to special issue on bioinformatics",
  journal =      j-TKDD,
  volume =       "2",
  number =       "1",
  pages =        "1:1--1:??",
  month =        mar,
  year =         "2008",
  CODEN =        "????",
  DOI =          "http://doi.acm.org/10.1145/1342320.1342321",
  ISSN =         "1556-4681 (print), 1556-472X (electronic)",
  bibdate =      "Fri Apr 24 17:59:18 MDT 2009",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tkdd/;
                 http://www.math.utah.edu/pub/tex/bib/tkdd.bib",
  acknowledgement = ack-nhfb,
  articleno =    "1",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1054",
}
@Article{Jin:2008:CMM,
  author =       "Ying Jin and T. M. Murali and Naren Ramakrishnan",
  title =        "Compositional mining of multirelational biological
                 datasets",
  journal =      j-TKDD,
  volume =       "2",
  number =       "1",
  pages =        "2:1--2:??",
  month =        mar,
  year =         "2008",
  CODEN =        "????",
  DOI =          "http://doi.acm.org/10.1145/1342320.1342322",
  ISSN =         "1556-4681 (print), 1556-472X (electronic)",
  bibdate =      "Fri Apr 24 17:59:18 MDT 2009",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tkdd/;
                 http://www.math.utah.edu/pub/tex/bib/tkdd.bib",
  abstract =     "High-throughput biological screens are yielding
                 ever-growing streams of information about multiple
                 aspects of cellular activity. As more and more
                 categories of datasets come online, there is a
                 corresponding multitude of ways in which inferences can
                 be chained across them, motivating the need for
                 compositional data mining algorithms. In this article,
                 we argue that such compositional data mining can be
                 effectively realized by functionally cascading
                 redescription mining and biclustering algorithms as
                 primitives. Both these primitives mirror shifts of
                 vocabulary that can be composed in arbitrary ways to
                 create rich chains of inferences. Given a relational
                 database and its schema, we show how the schema can be
                 automatically compiled into a compositional data mining
                 program, and how different domains in the schema can be
                 related through logical sequences of biclustering and
                 redescription invocations. This feature allows us to
                 rapidly prototype new data mining applications,
                 yielding greater understanding of scientific datasets.
                 We describe two applications of compositional data
                 mining: (i) matching terms across categories of the
                 Gene Ontology and (ii) understanding the molecular
                 mechanisms underlying stress response in human cells.",
  acknowledgement = ack-nhfb,
  articleno =    "2",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1054",
  keywords =     "Biclustering; bioinformatics; compositional data
                 mining; inductive logic programming; redescription
                 mining",
}
@Article{Sahay:2008:DSB,
  author =       "Saurav Sahay and Sougata Mukherjea and Eugene
                 Agichtein and Ernest V. Garcia and Shamkant B. Navathe
                 and Ashwin Ram",
  title =        "Discovering semantic biomedical relations utilizing
                 the {Web}",
  journal =      j-TKDD,
  volume =       "2",
  number =       "1",
  pages =        "3:1--3:??",
  month =        mar,
  year =         "2008",
  CODEN =        "????",
  DOI =          "http://doi.acm.org/10.1145/1342320.1342323",
  ISSN =         "1556-4681 (print), 1556-472X (electronic)",
  bibdate =      "Fri Apr 24 17:59:18 MDT 2009",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tkdd/;
                 http://www.math.utah.edu/pub/tex/bib/tkdd.bib",
  abstract =     "To realize the vision of a Semantic Web for Life
                 Sciences, discovering relations between resources is
                 essential. It is very difficult to automatically
                 extract relations from Web pages expressed in natural
                 language formats. On the other hand, because of the
                 explosive growth of information, it is difficult to
                 manually extract the relations. In this paper we
                 present techniques to automatically discover relations
                 between biomedical resources from the Web. For this
                 purpose we retrieve relevant information from Web
                 Search engines and Pubmed database using various
                 lexico-syntactic patterns as queries over SOAP web
                 services. The patterns are initially handcrafted but
                 can be progressively learnt. The extracted relations
                 can be used to construct and augment ontologies and
                 knowledge bases. Experiments are presented for general
                 biomedical relation discovery and domain specific
                 search to show the usefulness of our technique.",
  acknowledgement = ack-nhfb,
  articleno =    "3",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1054",
  keywords =     "Ontology construction; relation identification",
}
@Article{Ye:2008:DSA,
  author =       "Jieping Ye and Jianhui Chen and Ravi Janardan and
                 Sudhir Kumar",
  title =        "Developmental stage annotation of {Drosophila} gene
                 expression pattern images via an entire solution path
                 for {LDA}",
  journal =      j-TKDD,
  volume =       "2",
  number =       "1",
  pages =        "4:1--4:??",
  month =        mar,
  year =         "2008",
  CODEN =        "????",
  DOI =          "http://doi.acm.org/10.1145/1342320.1342324",
  ISSN =         "1556-4681 (print), 1556-472X (electronic)",
  bibdate =      "Fri Apr 24 17:59:18 MDT 2009",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tkdd/;
                 http://www.math.utah.edu/pub/tex/bib/string-matching.bib;
                 http://www.math.utah.edu/pub/tex/bib/tkdd.bib",
  abstract =     "Gene expression in a developing embryo occurs in
                 particular cells (spatial patterns) in a time-specific
                 manner (temporal patterns), which leads to the
                 differentiation of cell fates. Images of a {\em
                 Drosophila melanogaster\/} embryo at a given
                 developmental stage, showing a particular gene
                 expression pattern revealed by a gene-specific probe,
                 can be compared for spatial overlaps. The comparison is
                 fundamentally important to formulating and testing gene
                 interaction hypotheses. Expression pattern comparison
                 is most biologically meaningful when images from a
                 similar time point (developmental stage) are compared.
                 In this paper, we present LdaPath, a novel formulation
                 of Linear Discriminant Analysis (LDA) for automatic
                 developmental stage range classification. It employs
                 multivariate linear regression with the {$ L_1 $}-norm
                 penalty controlled by a regularization parameter for
                 feature extraction and visualization. LdaPath computes
                 an entire solution path for all values of
                 regularization parameter with essentially the same
                 computational cost as fitting one LDA model. Thus, it
                 facilitates efficient model selection. It is based on
                 the equivalence relationship between LDA and the least
                 squares method for multiclass classifications. This
                 equivalence relationship is established under a mild
                 condition, which we show empirically to hold for many
                 high-dimensional datasets, such as expression pattern
                 images. Our experiments on a collection of 2705
                 expression pattern images show the effectiveness of the
                 proposed algorithm. Results also show that the LDA
                 model resulting from LdaPath is sparse, and irrelevant
                 features may be removed. Thus, LdaPath provides a
                 general framework for simultaneous feature selection
                 and feature extraction.",
  acknowledgement = ack-nhfb,
  articleno =    "4",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1054",
  keywords =     "dimensionality reduction; Gene expression pattern
                 image; linear discriminant analysis; linear
                 regression",
}
@Article{Lu:2008:ADA,
  author =       "Yijuan Lu and Qi Tian and Jennifer Neary and Feng Liu
                 and Yufeng Wang",
  title =        "Adaptive discriminant analysis for microarray-based
                 classification",
  journal =      j-TKDD,
  volume =       "2",
  number =       "1",
  pages =        "5:1--5:??",
  month =        mar,
  year =         "2008",
  CODEN =        "????",
  DOI =          "http://doi.acm.org/10.1145/1342320.1342325",
  ISSN =         "1556-4681 (print), 1556-472X (electronic)",
  bibdate =      "Fri Apr 24 17:59:18 MDT 2009",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tkdd/;
                 http://www.math.utah.edu/pub/tex/bib/tkdd.bib",
  abstract =     "Microarray technology has generated enormous amounts
                 of high-dimensional gene expression data, providing a
                 unique platform for exploring gene regulatory networks.
                 However, the curse of dimensionality plagues effort to
                 analyze these high throughput data. Linear Discriminant
                 Analysis (LDA) and Biased Discriminant Analysis (BDA)
                 are two popular techniques for dimension reduction,
                 which pay attention to different roles of the positive
                 and negative samples in finding discriminating
                 subspace. However, the drawbacks of these two methods
                 are obvious: LDA has limited efficiency in classifying
                 sample data from subclasses with different
                 distributions, and BDA does not account for the
                 underlying distribution of negative samples.\par
                 In this paper, we propose a novel dimension reduction
                 technique for microarray analysis: Adaptive
                 Discriminant Analysis (ADA), which effectively exploits
                 favorable attributes of both BDA and LDA and avoids
                 their unfavorable ones. ADA can find a good
                 discriminative subspace with adaptation to different
                 sample distributions. It not only alleviates the
                 problem of high dimensionality, but also enhances the
                 classification performance in the subspace with
                 na{\"\i}ve Bayes classifier. To learn the best model
                 fitting the real scenario, boosted Adaptive
                 Discriminant Analysis is further proposed. Extensive
                 experiments on the yeast cell cycle regulation data
                 set, and the expression data of the red blood cell
                 cycle in malaria parasite {\em Plasmodium falciparum\/}
                 demonstrate the superior performance of ADA and boosted
                 ADA. We also present some putative genes of specific
                 functional classes predicted by boosted ADA. Their
                 potential functionality is confirmed by independent
                 predictions based on Gene Ontology, demonstrating that
                 ADA and boosted ADA are effective dimension reduction
                 methods for microarray-based classification.",
  acknowledgement = ack-nhfb,
  articleno =    "5",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1054",
  keywords =     "ADA; BDA; boosted ADA; dimension reduction; LDA;
                 microarray",
}
@Article{Hashimoto:2008:NEP,
  author =       "Kosuke Hashimoto and Kiyoko Flora Aoki-Kinoshita and
                 Nobuhisa Ueda and Minoru Kanehisa and Hiroshi
                 Mamitsuka",
  title =        "A new efficient probabilistic model for mining labeled
                 ordered trees applied to glycobiology",
  journal =      j-TKDD,
  volume =       "2",
  number =       "1",
  pages =        "6:1--6:??",
  month =        mar,
  year =         "2008",
  CODEN =        "????",
  DOI =          "http://doi.acm.org/10.1145/1342320.1342326",
  ISSN =         "1556-4681 (print), 1556-472X (electronic)",
  bibdate =      "Fri Apr 24 17:59:18 MDT 2009",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tkdd/;
                 http://www.math.utah.edu/pub/tex/bib/tkdd.bib",
  abstract =     "Mining frequent patterns from large datasets is an
                 important issue in data mining. Recently, complex and
                 unstructured (or semi-structured) datasets have
                 appeared as targets for major data mining applications,
                 including text mining, web mining and bioinformatics.
                 Our work focuses on labeled ordered trees, which are
                 typically semi-structured datasets. In bioinformatics,
                 carbohydrate sugar chains, or glycans, can be modeled
                 as labeled ordered trees. Glycans are the third major
                 class of biomolecules, having important roles in
                 signaling and recognition. For mining labeled ordered
                 trees, we propose a new probabilistic model and its
                 efficient learning scheme which significantly improves
                 the time and space complexity of an existing
                 probabilistic model for labeled ordered trees. We
                 evaluated the performance of the proposed model,
                 comparing it with those of other probabilistic models,
                 using synthetic as well as real datasets from
                 glycobiology. Experimental results showed that the
                 proposed model drastically reduced the computation time
                 of the competing model, keeping the predictive power
                 and avoiding overfitting to the training data. Finally,
                 we assessed our results on real data from a variety of
                 biological viewpoints, verifying known facts in
                 glycobiology.",
  acknowledgement = ack-nhfb,
  articleno =    "6",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1054",
  keywords =     "Expectation-maximization; labeled ordered trees;
                 maximum likelihood; probabilistic models",
}
@Article{Ge:2008:JCA,
  author =       "Rong Ge and Martin Ester and Byron J. Gao and Zengjian
                 Hu and Binay Bhattacharya and Boaz Ben-Moshe",
  title =        "Joint cluster analysis of attribute data and
                 relationship data: {The} connected {$k$}-center problem,
                 algorithms and applications",
  journal =      j-TKDD,
  volume =       "2",
  number =       "2",
  pages =        "7:1--7:??",
  month =        jul,
  year =         "2008",
  CODEN =        "????",
  DOI =          "http://doi.acm.org/10.1145/1376815.1376816",
  ISSN =         "1556-4681 (print), 1556-472X (electronic)",
  bibdate =      "Fri Apr 24 17:59:30 MDT 2009",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tkdd/;
                 http://www.math.utah.edu/pub/tex/bib/tkdd.bib",
  abstract =     "Attribute data and relationship data are two principal
                 types of data, representing the intrinsic and extrinsic
                 properties of entities. While attribute data have been
                 the main source of data for cluster analysis,
                 relationship data such as social networks or metabolic
                 networks are becoming increasingly available. It is
                 also common to observe both data types carry
                 complementary information such as in market
                 segmentation and community identification, which calls
                 for a joint cluster analysis of both data types so as
                 to achieve better results. In this article, we
                 introduce the novel Connected $k$-Center ({\em CkC\/})
                 problem, a clustering model taking into account
                 attribute data as well as relationship data. We analyze
                 the complexity of the problem and prove its
                 NP-hardness. Therefore, we analyze the approximability
                 of the problem and also present a constant factor
                 approximation algorithm. For the special case of the
                 {\em CkC\/} problem where the relationship data form a
                 tree structure, we propose a dynamic programming method
                 giving an optimal solution in polynomial time. We
                 further present NetScan, a heuristic algorithm that is
                 efficient and effective for large real databases. Our
                 extensive experimental evaluation on real datasets
                 demonstrates the meaningfulness and accuracy of the
                 NetScan results.",
  acknowledgement = ack-nhfb,
  articleno =    "7",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1054",
  keywords =     "approximation algorithms; Attribute data; community
                 identification; document clustering; joint cluster
                 analysis; market segmentation; NP-hardness;
                 relationship data",
}
@Article{Gupta:2008:BBC,
  author =       "Gunjan Gupta and Joydeep Ghosh",
  title =        "{Bregman} bubble clustering: a robust framework for
                 mining dense clusters",
  journal =      j-TKDD,
  volume =       "2",
  number =       "2",
  pages =        "8:1--8:??",
  month =        jul,
  year =         "2008",
  CODEN =        "????",
  DOI =          "http://doi.acm.org/10.1145/1376815.1376817",
  ISSN =         "1556-4681 (print), 1556-472X (electronic)",
  bibdate =      "Fri Apr 24 17:59:30 MDT 2009",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tkdd/;
                 http://www.math.utah.edu/pub/tex/bib/tkdd.bib",
  abstract =     "In classical clustering, each data point is assigned
                 to at least one cluster. However, in many applications
                 only a small subset of the available data is relevant
                 for the problem and the rest needs to be ignored in
                 order to obtain good clusters. Certain nonparametric
                 density-based clustering methods find the most relevant
                 data as multiple dense regions, but such methods are
                 generally limited to low-dimensional data and do not
                 scale well to large, high-dimensional datasets. Also,
                 they use a specific notion of ``distance'', typically
                 Euclidean or Mahalanobis distance, which further limits
                 their applicability. On the other hand, the recent One
                 Class Information Bottleneck (OC-IB) method is fast and
                 works on a large class of distortion measures known as
                 Bregman Divergences, but can only find a {\em single\/}
                 dense region. This article presents a broad framework
                 for finding $k$ dense clusters while ignoring the rest
                 of the data. It includes a seeding algorithm that can
                 automatically determine a suitable value for $k$.
                 When $k$ is forced to 1, our method gives rise to an
                 improved version of OC-IB with optimality guarantees.
                 We provide a generative model that yields the proposed
                 iterative algorithm for finding $k$ dense regions as a
                 special case. Our analysis reveals an interesting and
                 novel connection between the problem of finding dense
                 regions and exponential mixture models; a hard model
                 corresponding to $k$ exponential mixtures with a
                 uniform background results in a set of $k$ dense
                 clusters. The proposed method describes a highly
                 scalable algorithm for finding multiple dense regions
                 that works with any Bregman Divergence, thus extending
                 density based clustering to a variety of non-Euclidean
                 problems not addressable by earlier methods. We present
                 empirical results on three artificial, two microarray
                 and one text dataset to show the relevance and
                 effectiveness of our methods.",
  acknowledgement = ack-nhfb,
  articleno =    "8",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1054",
  keywords =     "Bregman divergences; Density-based clustering;
                 expectation maximization; exponential family; One Class
                 classification",
}
@Article{Tan:2008:TMG,
  author =       "Henry Tan and Fedja Hadzic and Tharam S. Dillon and
                 Elizabeth Chang and Ling Feng",
  title =        "Tree model guided candidate generation for mining
                 frequent subtrees from {XML} documents",
  journal =      j-TKDD,
  volume =       "2",
  number =       "2",
  pages =        "9:1--9:??",
  month =        jul,
  year =         "2008",
  CODEN =        "????",
  DOI =          "http://doi.acm.org/10.1145/1376815.1376818",
  ISSN =         "1556-4681 (print), 1556-472X (electronic)",
  bibdate =      "Fri Apr 24 17:59:30 MDT 2009",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tkdd/;
                 http://www.math.utah.edu/pub/tex/bib/tkdd.bib",
  abstract =     "Due to the inherent flexibilities in both structure
                 and semantics, XML association rules mining faces few
                 challenges, such as: a more complicated hierarchical
                 data structure and ordered data context. Mining
                 frequent patterns from XML documents can be recast as
                 mining frequent tree structures from a database of XML
                 documents. In this study, we model a database of XML
                 documents as a database of rooted labeled ordered
                 subtrees. In particular, we are mainly concerned with
                 mining frequent induced and embedded ordered subtrees.
                 Our main contributions are as follows. We describe our
                 unique {\em embedding list\/} representation of the
                 tree structure, which enables efficient implementation
                 of our {\em Tree Model Guided\/} ({\em TMG\/})
                 candidate generation. {\em TMG\/} is an optimal,
                 nonredundant enumeration strategy that enumerates all
                 the valid candidates that conform to the structural
                 aspects of the data. We show through a mathematical
                 model and experiments that {\em TMG\/} has better
                 complexity compared to the commonly used join approach.
                 In this article, we propose two algorithms, MB3-Miner
                 and iMB3-Miner. MB3-Miner mines embedded subtrees.
                 iMB3-Miner mines induced and/or embedded subtrees by
                 using the {\em maximum level of embedding constraint}.
                 Our experiments with both synthetic and real datasets
                 against two well-known algorithms for mining induced
                 and embedded subtrees, demonstrate the effectiveness
                 and the efficiency of the proposed techniques.",
  acknowledgement = ack-nhfb,
  articleno =    "9",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1054",
  keywords =     "FREQT; TMG; Tree mining; tree model guided;
                 TreeMiner",
}
@Article{Islam:2008:STS,
  author =       "Aminul Islam and Diana Inkpen",
  title =        "Semantic text similarity using corpus-based word
                 similarity and string similarity",
  journal =      j-TKDD,
  volume =       "2",
  number =       "2",
  pages =        "10:1--10:??",
  month =        jul,
  year =         "2008",
  CODEN =        "????",
  DOI =          "http://doi.acm.org/10.1145/1376815.1376819",
  ISSN =         "1556-4681 (print), 1556-472X (electronic)",
  bibdate =      "Fri Apr 24 17:59:30 MDT 2009",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tkdd/;
                 http://www.math.utah.edu/pub/tex/bib/tkdd.bib",
  abstract =     "We present a method for measuring the semantic
                 similarity of texts using a corpus-based measure of
                 semantic word similarity and a normalized and modified
                 version of the Longest Common Subsequence (LCS) string
                 matching algorithm. Existing methods for computing text
                 similarity have focused mainly on either large
                 documents or individual words. We focus on computing
                 the similarity between two sentences or two short
                 paragraphs. The proposed method can be exploited in a
                 variety of applications involving textual knowledge
                 representation and knowledge discovery. Evaluation
                 results on two different data sets show that our method
                 outperforms several competing methods.",
  acknowledgement = ack-nhfb,
  articleno =    "10",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1054",
  keywords =     "corpus-based measures; Semantic similarity of words;
                 similarity of short texts",
}
@Article{Sun:2008:ITA,
  author =       "Jimeng Sun and Dacheng Tao and Spiros Papadimitriou
                 and Philip S. Yu and Christos Faloutsos",
  title =        "Incremental tensor analysis: {Theory} and
                 applications",
  journal =      j-TKDD,
  volume =       "2",
  number =       "3",
  pages =        "11:1--11:??",
  month =        oct,
  year =         "2008",
  CODEN =        "????",
  DOI =          "http://doi.acm.org/10.1145/1409620.1409621",
  ISSN =         "1556-4681 (print), 1556-472X (electronic)",
  bibdate =      "Fri Apr 24 17:59:41 MDT 2009",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tkdd/;
                 http://www.math.utah.edu/pub/tex/bib/tkdd.bib",
  abstract =     "How do we find patterns in author-keyword
                 associations, evolving over time? Or in data cubes
                 (tensors), with product-branch-customer sales
                 information? And more generally, how to summarize
                 high-order data cubes (tensors)? How to incrementally
                 update these patterns over time? Matrix decompositions,
                 like principal component analysis (PCA) and variants,
                 are invaluable tools for mining, dimensionality
                 reduction, feature selection, rule identification in
                 numerous settings like streaming data, text, graphs,
                 social networks, and many more settings. However, they
                 have only two orders (i.e., matrices, like author and
                 keyword in the previous example).\par
                 We propose to envision such higher-order data as
                 tensors, and tap the vast literature on the topic.
                 However, these methods do not necessarily scale up, let
                 alone operate on semi-infinite streams. Thus, we
                 introduce a general framework, incremental tensor
                 analysis (ITA), which efficiently computes a compact
                 summary for high-order and high-dimensional data, and
                 also reveals the hidden correlations. Three variants of
                 ITA are presented: (1) dynamic tensor analysis (DTA);
                 (2) streaming tensor analysis (STA); and (3)
                 window-based tensor analysis (WTA). In particular, we
                 explore several fundamental design trade-offs such as
                 space efficiency, computational cost, approximation
                 accuracy, time dependency, and model complexity.\par
                 We implement all our methods and apply them in several
                 real settings, such as network anomaly detection,
                 multiway latent semantic indexing on citation networks,
                 and correlation study on sensor measurements. Our
                 empirical studies show that the proposed methods are
                 fast and accurate and that they find interesting
                 patterns and outliers on the real datasets.",
  acknowledgement = ack-nhfb,
  articleno =    "11",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1054",
  keywords =     "multilinear algebra; stream mining; Tensor",
}
@Article{Mangasarian:2008:PPC,
  author =       "Olvi L. Mangasarian and Edward W. Wild and Glenn M.
                 Fung",
  title =        "Privacy-preserving classification of vertically
                 partitioned data via random kernels",
  journal =      j-TKDD,
  volume =       "2",
  number =       "3",
  pages =        "12:1--12:??",
  month =        oct,
  year =         "2008",
  CODEN =        "????",
  DOI =          "http://doi.acm.org/10.1145/1409620.1409622",
  ISSN =         "1556-4681 (print), 1556-472X (electronic)",
  bibdate =      "Fri Apr 24 17:59:41 MDT 2009",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tkdd/;
                 http://www.math.utah.edu/pub/tex/bib/tkdd.bib",
  abstract =     "We propose a novel privacy-preserving support vector
                 machine (SVM) classifier for a data matrix $A$ whose
                 input feature columns are divided into groups belonging
                 to different entities. Each entity is unwilling to
                 share its group of columns or make it public. Our
                 classifier is based on the concept of a reduced kernel
                 $k(A, B')$, where $B'$ is the transpose of a
                 random matrix $B$. The column blocks of $B$
                 corresponding to the different entities are privately
                 generated by each entity and never made public. The
                 proposed linear or nonlinear SVM classifier, which is
                 public but does not reveal any of the privately held
                 data, has accuracy comparable to that of an ordinary
                 SVM classifier that uses the entire set of input
                 features directly.",
  acknowledgement = ack-nhfb,
  articleno =    "12",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1054",
  keywords =     "Privacy preserving classification; support vector
                 machines; vertically partitioned data",
}
@Article{Lakshmanan:2008:DRA,
  author =       "Laks V. S. Lakshmanan and Raymond T. Ng and Ganesh
                 Ramesh",
  title =        "On disclosure risk analysis of anonymized itemsets in
                 the presence of prior knowledge",
  journal =      j-TKDD,
  volume =       "2",
  number =       "3",
  pages =        "13:1--13:??",
  month =        oct,
  year =         "2008",
  CODEN =        "????",
  DOI =          "http://doi.acm.org/10.1145/1409620.1409623",
  ISSN =         "1556-4681 (print), 1556-472X (electronic)",
  bibdate =      "Fri Apr 24 17:59:41 MDT 2009",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tkdd/;
                 http://www.math.utah.edu/pub/tex/bib/tkdd.bib",
  abstract =     "Decision makers of companies often face the dilemma of
                 whether to release data for knowledge discovery,
                 vis-a-vis the risk of disclosing proprietary or
                 sensitive information. Among the various methods
                 employed for ``sanitizing'' the data prior to
                 disclosure, we focus in this article on anonymization,
                 given its widespread use in practice. We do due
                 diligence to the question ``just how safe is the
                 anonymized data?'' We consider both those scenarios
                 when the hacker has no information and, more
                 realistically, when the hacker may have partial
                 information about items in the domain. We conduct our
                 analyses in the context of frequent set mining and
                 address the safety question at two different levels:
                 (i) how likely of being cracked (i.e., re-identified by
                 a hacker), are the identities of individual items and
                 (ii) how likely are sets of items cracked? For
                 capturing the prior knowledge of the hacker, we propose
                 a {\em belief function}, which amounts to an educated
                 guess of the frequency of each item. For various
                 classes of belief functions which correspond to
                 different degrees of prior knowledge, we derive
                 formulas for computing the expected number of cracks of
                 single items and for itemsets, the probability of
                 cracking the itemsets. While obtaining, exact values
                 for more general situations is computationally hard, we
                 propose a series of heuristics called the {\em
                 O-estimates}. They are easy to compute and are shown
                 fairly accurate, justified by empirical results on real
                 benchmark datasets. Based on the O-estimates, we
                 propose a recipe for the decision makers to resolve
                 their dilemma. Our recipe operates at two different
                 levels, depending on whether the data owner wants to
                 reason in terms of single items or sets of items (or
                 both). Finally, we present techniques for ascertaining
                 a hacker's knowledge of correlation in terms of
                 co-occurrence of items likely. This information
                 regarding the hacker's knowledge can be incorporated
                 into our framework of disclosure risk analysis and we
                 present experimental results demonstrating how this
                 knowledge affects the heuristic estimates we have
                 developed.",
  acknowledgement = ack-nhfb,
  articleno =    "13",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1054",
  keywords =     "anonymization; belief function; bipartite graphs;
                 correlation; Disclosure risk; frequent itemsets;
                 hacker; matching; prior knowledge; sampling",
}
@Article{Vaidya:2008:PPD,
  author          = {Jaideep Vaidya and Chris Clifton and Murat
                     Kantarcioglu and A. Scott Patterson},
  title           = {Privacy-preserving decision trees over vertically
                     partitioned data},
  journal         = j-TKDD,
  volume          = {2},
  number          = {3},
  pages           = {14:1--14:??},
  month           = oct,
  year            = {2008},
  CODEN           = {????},
  DOI             = {http://doi.acm.org/10.1145/1409620.1409624},
  ISSN            = {1556-4681 (print), 1556-472X (electronic)},
  bibdate         = {Fri Apr 24 17:59:41 MDT 2009},
  bibsource       = {http://www.acm.org/pubs/contents/journals/tkdd/;
                     http://www.math.utah.edu/pub/tex/bib/tkdd.bib},
  abstract        = {Privacy and security concerns can prevent sharing of
                     data, derailing data-mining projects. Distributed
                     knowledge discovery, if done correctly, can alleviate
                     this problem. We introduce a generalized
                     privacy-preserving variant of the ID3 algorithm for
                     vertically partitioned data distributed over two or
                     more parties. Along with a proof of security, we
                     discuss what would be necessary to make the protocols
                     completely secure. We also provide experimental
                     results, giving a first demonstration of the practical
                     complexity of secure multiparty computation-based data
                     mining.},
  acknowledgement = ack-nhfb,
  articleno       = {14},
  journal-URL     = {http://portal.acm.org/browse_dl.cfm?idx=J1054},
  keywords        = {Decision tree classification; privacy},
}
@Article{Chuang:2009:FPS,
  author =       "Kun-Ta Chuang and Hung-Leng Chen and Ming-Syan Chen",
  title =        "Feature-preserved sampling over streaming data",
  journal =      j-TKDD,
  volume =       "2",
  number =       "4",
  pages =        "15:1--15:??",
  month =        jan,
  year =         "2009",
  CODEN =        "????",
  DOI =          "http://doi.acm.org/10.1145/1460797.1460798",
  ISSN =         "1556-4681 (print), 1556-472X (electronic)",
  bibdate =      "Fri Apr 24 17:59:51 MDT 2009",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tkdd/;
                 http://www.math.utah.edu/pub/tex/bib/tkdd.bib",
  abstract =     "In this article, we explore a novel sampling model,
                 called {\em feature preserved sampling\/} ({\em FPS\/})
                 that sequentially generates a high-quality sample over
                 sliding windows. The sampling quality we consider
                 refers to the degree of consistency between the sample
                 proportion and the population proportion of each
                 attribute value in a window. Due to the time-variant
                 nature of real-world datasets, users are more likely to
                 be interested in the most recent data. However,
                 previous works have not been able to generate a
                 high-quality sample over sliding windows that precisely
                 preserves up-to-date population characteristics.
                 Motivated by this shortcoming, we have developed the
                 {\em FPS\/} algorithm, which has several advantages:
                 (1) it sequentially generates a sample from a
                 time-variant data source over sliding windows; (2) the
                 execution time of {\em FPS\/} is linear with respect to
                 the database size; (3) the {\em relative\/}
                 proportional differences between the sample proportions
                 and population proportions of most distinct attribute
                 values are guaranteed to be below a specified error
                 threshold, $\epsilon$, while the {\em relative\/}
                 proportion differences of the remaining attribute
                 values are as close to $\epsilon$ as possible, which
                 ensures that the generated sample is of high quality;
                 (4) the sample rate is close to the user specified rate
                 so that a high quality sampling result can be obtained
                 without increasing the sample size; (5) by a thorough
                 analytical and empirical study, we prove that {\em
                 FPS\/} has acceptable space overheads, especially when
                 the attribute values have Zipfian distributions, and
                 {\em FPS\/} can also excellently preserve the
                 population proportion of multivariate features in the
                 sample; and (6) {\em FPS\/} can be applied to infinite
                 streams and finite datasets equally, and the generated
                 samples can be used for various applications. Our
                 experiments on both real and synthetic data validate
                 that {\em FPS\/} can effectively obtain a high quality
                 sample of the desired size. In addition, while using
                 the sample generated by {\em FPS\/} in various mining
                 applications, a significant improvement in efficiency
                 can be achieved without compromising the model's
                 precision.",
  acknowledgement = ack-nhfb,
  articleno =    "15",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1054",
  keywords =     "sampling; Streaming mining",
}
@Article{Jiang:2009:MFC,
  author =       "Daxin Jiang and Jian Pei",
  title =        "Mining frequent cross-graph quasi-cliques",
  journal =      j-TKDD,
  volume =       "2",
  number =       "4",
  pages =        "16:1--16:??",
  month =        jan,
  year =         "2009",
  CODEN =        "????",
  DOI =          "http://doi.acm.org/10.1145/1460797.1460799",
  ISSN =         "1556-4681 (print), 1556-472X (electronic)",
  bibdate =      "Fri Apr 24 17:59:51 MDT 2009",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tkdd/;
                 http://www.math.utah.edu/pub/tex/bib/tkdd.bib",
  abstract =     "Joint mining of multiple datasets can often discover
                 interesting, novel, and reliable patterns which cannot
                 be obtained solely from any single source. For example,
                 in bioinformatics, jointly mining multiple gene
                 expression datasets obtained by different labs or
                 during various biological processes may overcome the
                 heavy noise in the data. Moreover, by joint mining of
                 gene expression data and protein-protein interaction
                 data, we may discover clusters of genes which show
                 coherent expression patterns and also produce
                 interacting proteins. Such clusters may be potential
                 pathways.\par
                 In this article, we investigate a novel data mining
                 problem, {\em mining frequent cross-graph
                 quasi-cliques}, which is generalized from several
                 interesting applications in bioinformatics,
                 cross-market customer segmentation, social network
                 analysis, and Web mining. In a graph, a set of vertices
                 $S$ is a $\gamma$-quasi-clique $(0 < \gamma \leq 1)$ if
                 each vertex $v$ in $S$ directly connects to at least
                 $\gamma \cdot (|S| - 1)$ other vertices in $S$. Given a
                 set of graphs $G_1, \ldots{}, G_n$ and parameter ${\rm
                 min\_sup} (0 < {\rm min\_sup} \leq 1)$, a set of vertices
                 $S$ is a frequent cross-graph quasi-clique if $S$ is a
                 $\gamma$-quasi-clique in at least ${\rm min\_sup} \cdot
                 n$ graphs, and there does not exist a proper superset
                 of $S$ having the property.\par
                 We build a general model, show why the complete set of
                 frequent cross-graph quasi-cliques cannot be found by
                 previous data mining methods, and study the complexity
                 of the problem. While the problem is difficult, we
                 develop practical algorithms which exploit several
                 interesting and effective techniques and heuristics to
                 efficaciously mine frequent cross-graph quasi-cliques.
                 A systematic performance study is reported on both
                 synthetic and real data sets. We demonstrate some
                 interesting and meaningful frequent cross-graph
                 quasi-cliques in bioinformatics. The experimental
                 results also show that our algorithms are efficient and
                 scalable.",
  acknowledgement = ack-nhfb,
  articleno =    "16",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1054",
  keywords =     "bioinformatics; clique; Graph mining; joint mining",
}
@Article{Domeniconi:2009:WCE,
  author          = {Carlotta Domeniconi and Muna Al-Razgan},
  title           = {Weighted cluster ensembles: {Methods} and analysis},
  journal         = j-TKDD,
  volume          = {2},
  number          = {4},
  pages           = {17:1--17:??},
  month           = jan,
  year            = {2009},
  CODEN           = {????},
  DOI             = {http://doi.acm.org/10.1145/1460797.1460800},
  ISSN            = {1556-4681 (print), 1556-472X (electronic)},
  bibdate         = {Fri Apr 24 17:59:51 MDT 2009},
  bibsource       = {http://www.acm.org/pubs/contents/journals/tkdd/;
                     http://www.math.utah.edu/pub/tex/bib/tkdd.bib},
  abstract        = {Cluster ensembles offer a solution to challenges
                     inherent to clustering arising from its ill-posed
                     nature. Cluster ensembles can provide robust and stable
                     solutions by leveraging the consensus across multiple
                     clustering results, while averaging out emergent
                     spurious structures that arise due to the various
                     biases to which each participating algorithm is tuned.
                     In this article, we address the problem of combining
                     multiple {\em weighted clusters\/} that belong to
                     different subspaces of the input space. We leverage the
                     diversity of the input clusterings in order to generate
                     a consensus partition that is superior to the
                     participating ones. Since we are dealing with weighted
                     clusters, our consensus functions make use of the
                     weight vectors associated with the clusters. We
                     demonstrate the effectiveness of our techniques by
                     running experiments with several real datasets,
                     including high-dimensional text data. Furthermore, we
                     investigate in depth the issue of diversity and
                     accuracy for our ensemble methods. Our analysis and
                     experimental results show that the proposed techniques
                     are capable of producing a partition that is as good as
                     or better than the best individual clustering.},
  acknowledgement = ack-nhfb,
  articleno       = {17},
  journal-URL     = {http://portal.acm.org/browse_dl.cfm?idx=J1054},
  keywords        = {accuracy and diversity measures; Cluster ensembles;
                     consensus functions; data mining; subspace clustering;
                     text data},
}
@Article{Zhang:2009:DGA,
  author          = {Zhenjie Zhang and Laks V. S. Lakshmanan and Anthony K.
                     H. Tung},
  title           = {On domination game analysis for microeconomic data
                     mining},
  journal         = j-TKDD,
  volume          = {2},
  number          = {4},
  pages           = {18:1--18:??},
  month           = jan,
  year            = {2009},
  CODEN           = {????},
  DOI             = {http://doi.acm.org/10.1145/1460797.1460801},
  ISSN            = {1556-4681 (print), 1556-472X (electronic)},
  bibdate         = {Fri Apr 24 17:59:51 MDT 2009},
  bibsource       = {http://www.acm.org/pubs/contents/journals/tkdd/;
                     http://www.math.utah.edu/pub/tex/bib/tkdd.bib},
  abstract        = {Game theory is a powerful tool for analyzing the
                     competitions among manufacturers in a market. In this
                     article, we present a study on combining game theory
                     and data mining by introducing the concept of
                     domination game analysis. We present a multidimensional
                     market model, where every dimension represents one
                     attribute of a commodity. Every product or customer is
                     represented by a point in the multidimensional space,
                     and a product is said to ``dominate'' a customer if all
                     of its attributes can satisfy the requirements of the
                     customer. The expected market share of a product is
                     measured by the expected number of the buyers in the
                     customers, all of which are equally likely to buy any
                     product dominating him. A Nash equilibrium is a
                     configuration of the products achieving stable expected
                     market shares for all products. We prove that Nash
                     equilibrium in such a model can be computed in
                     polynomial time if every manufacturer tries to modify
                     its product in a round robin manner. To further improve
                     the efficiency of the computation, we also design two
                     algorithms for the manufacturers to efficiently find
                     their best response to other products in the market.},
  acknowledgement = ack-nhfb,
  articleno       = {18},
  journal-URL     = {http://portal.acm.org/browse_dl.cfm?idx=J1054},
  keywords        = {data mining; Domination game; game theory},
}
@Article{Kriegel:2009:CHD,
  author          = {Hans-Peter Kriegel and Peer Kr{\"o}ger and Arthur
                     Zimek},
  title           = {Clustering high-dimensional data: {A} survey on
                     subspace clustering, pattern-based clustering, and
                     correlation clustering},
  journal         = j-TKDD,
  volume          = {3},
  number          = {1},
  pages           = {1:1--1:??},
  month           = mar,
  year            = {2009},
  CODEN           = {????},
  DOI             = {http://doi.acm.org/10.1145/1497577.1497578},
  ISSN            = {1556-4681 (print), 1556-472X (electronic)},
  bibdate         = {Fri Apr 24 18:00:01 MDT 2009},
  bibsource       = {http://www.acm.org/pubs/contents/journals/tkdd/;
                     http://www.math.utah.edu/pub/tex/bib/tkdd.bib},
  abstract        = {As a prolific research area in data mining, subspace
                     clustering and related problems induced a vast quantity
                     of proposed solutions. However, many publications
                     compare a new proposition --- if at all --- with one or
                     two competitors, or even with a so-called
                     ``na{\"\i}ve'' ad hoc solution, but fail to clarify the
                     exact problem definition. As a consequence, even if two
                     solutions are thoroughly compared experimentally, it
                     will often remain unclear whether both solutions tackle
                     the same problem or, if they do, whether they agree in
                     certain tacit assumptions and how such assumptions may
                     influence the outcome of an algorithm. In this survey,
                     we try to clarify: (i) the different problem
                     definitions related to subspace clustering in general;
                     (ii) the specific difficulties encountered in this
                     field of research; (iii) the varying assumptions,
                     heuristics, and intuitions forming the basis of
                     different approaches; and (iv) how several prominent
                     solutions tackle different problems.},
  acknowledgement = ack-nhfb,
  articleno       = {1},
  journal-URL     = {http://portal.acm.org/browse_dl.cfm?idx=J1054},
  keywords        = {clustering; high-dimensional data; Survey},
}
@Article{Dhurandhar:2009:SAM,
  author          = {Amit Dhurandhar and Alin Dobra},
  title           = {Semi-analytical method for analyzing models and model
                     selection measures based on moment analysis},
  journal         = j-TKDD,
  volume          = {3},
  number          = {1},
  pages           = {2:1--2:??},
  month           = mar,
  year            = {2009},
  CODEN           = {????},
  DOI             = {http://doi.acm.org/10.1145/1497577.1497579},
  ISSN            = {1556-4681 (print), 1556-472X (electronic)},
  bibdate         = {Fri Apr 24 18:00:01 MDT 2009},
  bibsource       = {http://www.acm.org/pubs/contents/journals/tkdd/;
                     http://www.math.utah.edu/pub/tex/bib/tkdd.bib},
  abstract        = {In this article we propose a moment-based method for
                     studying models and model selection measures. By
                     focusing on the probabilistic space of classifiers
                     induced by the classification algorithm rather than on
                     that of datasets, we obtain efficient characterizations
                     for computing the moments, which is followed by
                     visualization of the resulting formulae that are too
                     complicated for direct interpretation. By assuming the
                     data to be drawn independently and identically
                     distributed from the underlying probability
                     distribution, and by going over the space of all
                     possible datasets, we establish general relationships
                     between the generalization error, hold-out-set error,
                     cross-validation error, and leave-one-out error. We
                     later exemplify the method and the results by studying
                     the behavior of the errors for the naive Bayes
                     classifier.},
  acknowledgement = ack-nhfb,
  articleno       = {2},
  journal-URL     = {http://portal.acm.org/browse_dl.cfm?idx=J1054},
  keywords        = {classification; generalization error; Model
                     selection},
}
@Article{Cerf:2009:CPM,
  author =       "Lo{\"\i}c Cerf and J{\'e}r{\'e}my Besson and
                 C{\'e}line Robardet and Jean-Fran{\c{c}}ois Boulicaut",
  title =        "Closed patterns meet $n$-ary relations",
  journal =      j-TKDD,
  volume =       "3",
  number =       "1",
  pages =        "3:1--3:??",
  month =        mar,
  year =         "2009",
  CODEN =        "????",
  DOI =          "http://doi.acm.org/10.1145/1497577.1497580",
  ISSN =         "1556-4681 (print), 1556-472X (electronic)",
  bibdate =      "Fri Apr 24 18:00:01 MDT 2009",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tkdd/;
                 http://www.math.utah.edu/pub/tex/bib/tkdd.bib",
  abstract =     "Set pattern discovery from binary relations has been
                 extensively studied during the last decade. In
                 particular, many complete and efficient algorithms for
                 frequent closed set mining are now available.
                 Generalizing such a task to $n$-ary relations ($n \geq
                 2$) appears as a timely challenge. It may be important
                 for many applications, for example, when adding the
                 time dimension to the popular {\em objects\/} $\times$
                 {\em features\/} binary case. The generality of the
                 task (no assumption being made on the relation arity or
                 on the size of its attribute domains) makes it
                 computationally challenging. We introduce an algorithm
                 called Data-Peeler. From an $n$-ary relation, it
                 extracts all closed $n$-sets satisfying given piecewise
                 (anti) monotonic constraints. This new class of
                 constraints generalizes both monotonic and
                 antimonotonic constraints. Considering the special case
                 of ternary relations, Data-Peeler outperforms the
                 state-of-the-art algorithms CubeMiner and Trias by
                 orders of magnitude. These good performances must be
                 granted to a new clever enumeration strategy allowing
                 to efficiently enforce the closeness property. The
                 relevance of the extracted closed $n$-sets is assessed
                 on real-life 3- and 4-ary relations. Beyond natural 3- or
                 4-ary relations, expanding a relation with an
                 additional attribute can help in enforcing rather
                 abstract constraints such as the robustness with
                 respect to binarization. Furthermore, a collection of
                 closed $n$-sets is shown to be an excellent starting
                 point to compute a tiling of the dataset.",
  acknowledgement = ack-nhfb,
  articleno =    "3",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1054",
  keywords =     "$n$-ary relations; Closed patterns; constraint
                 properties; constraint-based mining; tiling",
}
@Article{Angiulli:2009:DEA,
  author          = {Fabrizio Angiulli and Fabio Fassetti},
  title           = {{DOLPHIN}: {An} efficient algorithm for mining
                     distance-based outliers in very large datasets},
  journal         = j-TKDD,
  volume          = {3},
  number          = {1},
  pages           = {4:1--4:??},
  month           = mar,
  year            = {2009},
  CODEN           = {????},
  DOI             = {http://doi.acm.org/10.1145/1497577.1497581},
  ISSN            = {1556-4681 (print), 1556-472X (electronic)},
  bibdate         = {Fri Apr 24 18:00:01 MDT 2009},
  bibsource       = {http://www.acm.org/pubs/contents/journals/tkdd/;
                     http://www.math.utah.edu/pub/tex/bib/tkdd.bib},
  abstract        = {In this work a novel distance-based outlier detection
                     algorithm, named DOLPHIN, working on disk-resident
                     datasets and whose I/O cost corresponds to the cost of
                     sequentially reading the input dataset file twice, is
                     presented.\par
                     It is both theoretically and empirically shown that the
                     main memory usage of DOLPHIN amounts to a small
                     fraction of the dataset and that DOLPHIN has linear
                     time performance with respect to the dataset size.
                     DOLPHIN gains efficiency by naturally merging together
                     in a unified schema three strategies, namely the
                     selection policy of objects to be maintained in main
                     memory, usage of pruning rules, and similarity search
                     techniques. Importantly, similarity search is
                     accomplished by the algorithm without the need of
                     preliminarily indexing the whole dataset, as other
                     methods do.\par
                     The algorithm is simple to implement and it can be used
                     with any type of data, belonging to either metric or
                     nonmetric spaces. Moreover, a modification to the basic
                     method allows DOLPHIN to deal with the scenario in
                     which the available buffer of main memory is smaller
                     than its standard requirements. DOLPHIN has been
                     compared with state-of-the-art distance-based outlier
                     detection algorithms, showing that it is much more
                     efficient.},
  acknowledgement = ack-nhfb,
  articleno       = {4},
  journal-URL     = {http://portal.acm.org/browse_dl.cfm?idx=J1054},
  keywords        = {Data mining; distance-based outliers; outlier
                     detection},
}
@Article{Chen:2009:BAS,
  author          = {Bee-Chung Chen and Raghu Ramakrishnan and Jude W.
                     Shavlik and Pradeep Tamma},
  title           = {Bellwether analysis: {Searching} for cost-effective
                     query-defined predictors in large databases},
  journal         = j-TKDD,
  volume          = {3},
  number          = {1},
  pages           = {5:1--5:??},
  month           = mar,
  year            = {2009},
  CODEN           = {????},
  DOI             = {http://doi.acm.org/10.1145/1497577.1497582},
  ISSN            = {1556-4681 (print), 1556-472X (electronic)},
  bibdate         = {Fri Apr 24 18:00:01 MDT 2009},
  bibsource       = {http://www.acm.org/pubs/contents/journals/tkdd/;
                     http://www.math.utah.edu/pub/tex/bib/tkdd.bib},
  abstract        = {How to mine massive datasets is a challenging problem
                     with great potential value. Motivated by this
                     challenge, much effort has concentrated on developing
                     scalable versions of machine learning algorithms.
                     However, the cost of mining large datasets is not just
                     computational; preparing the datasets into the ``right
                     form'' so that learning algorithms can be applied is
                     usually costly, due to the human labor that is
                     typically required and a large number of choices in
                     data preparation, which include selecting different
                     subsets of data and aggregating data at different
                     granularities. We make the key observation that, for a
                     number of practically motivated problems, these choices
                     can be defined using database queries and analyzed in
                     an automatic and systematic manner. Specifically, we
                     propose a new class of data-mining problem, called {\em
                     bellwether analysis}, in which the goal is to find a
                     few query-defined predictors (e.g., first week sales of
                     Peoria, IL of an item) that can be used to accurately
                     predict the result of a target query (e.g., first year
                     worldwide sales of the item) from a large number of
                     queries that define candidate predictors. To make a
                     prediction for a new item, the data needed to generate
                     such predictors has to be collected (e.g., selling the
                     new item in Peoria, IL for a week and collecting the
                     sales data). A useful predictor is one that has high
                     prediction accuracy and a low data-collection cost. We
                     call such a cost-effective predictor a {\em
                     bellwether}.\par
                     This article introduces bellwether analysis, which
                     integrates database query processing and predictive
                     modeling into a single framework, and provides scalable
                     algorithms for large datasets that cannot fit in main
                     memory. Through a series of extensive experiments, we
                     show that bellwethers do exist in real-world databases,
                     and that our computation techniques achieve good
                     efficiency on large datasets.},
  acknowledgement = ack-nhfb,
  articleno       = {5},
  journal-URL     = {http://portal.acm.org/browse_dl.cfm?idx=J1054},
  keywords        = {bellwether; Cost-effective prediction; data cube; OLAP
                     queries; predictive models; scalable algorithms},
}
@Article{Liu:2009:ISI,
  author          = {Huan Liu and John Salerno and Michael Young and Rakesh
                     Agrawal and Philip S. Yu},
  title           = {Introduction to special issue on social computing,
                     behavioral modeling, and prediction},
  journal         = j-TKDD,
  volume          = {3},
  number          = {2},
  pages           = {6:1--6:??},
  month           = apr,
  year            = {2009},
  CODEN           = {????},
  DOI             = {http://doi.acm.org/10.1145/1514888.1514889},
  ISSN            = {1556-4681 (print), 1556-472X (electronic)},
  bibdate         = {Fri Apr 24 18:00:12 MDT 2009},
  bibsource       = {http://www.acm.org/pubs/contents/journals/tkdd/;
                     http://www.math.utah.edu/pub/tex/bib/tkdd.bib},
  acknowledgement = ack-nhfb,
  articleno       = {6},
  journal-URL     = {http://portal.acm.org/browse_dl.cfm?idx=J1054},
}
@Article{Mehler:2009:ENC,
  author          = {Andrew Mehler and Steven Skiena},
  title           = {Expanding network communities from representative
                     examples},
  journal         = j-TKDD,
  volume          = {3},
  number          = {2},
  pages           = {7:1--7:??},
  month           = apr,
  year            = {2009},
  CODEN           = {????},
  DOI             = {http://doi.acm.org/10.1145/1514888.1514890},
  ISSN            = {1556-4681 (print), 1556-472X (electronic)},
  bibdate         = {Fri Apr 24 18:00:12 MDT 2009},
  bibsource       = {http://www.acm.org/pubs/contents/journals/tkdd/;
                     http://www.math.utah.edu/pub/tex/bib/tkdd.bib},
  abstract        = {We present an approach to leverage a small subset of a
                     coherent community within a social network into a much
                     larger, more representative sample. Our problem becomes
                     identifying a small conductance subgraph containing
                     many (but not necessarily all) members of the given
                     seed set. Starting with an initial seed set
                     representing a sample of a community, we seek to
                     discover as much of the full community as
                     possible.\par
                     We present a general method for network community
                     expansion, demonstrating that our methods work well in
                     expanding communities in real world networks starting
                     from small given seed groups (20 to 400 members). Our
                     approach is marked by incremental expansion from the
                     seeds with retrospective analysis to determine the
                     ultimate boundaries of our community. We demonstrate
                     how to increase the robustness of the general approach
                     through bootstrapping multiple random partitions of the
                     input set into seed and evaluation groups.\par
                     We go beyond statistical comparisons against gold
                     standards to careful subjective evaluations of our
                     expanded communities. This process explains the causes
                     of most disagreement between our expanded communities
                     and our gold-standards --- arguing that our expansion
                     methods provide more reliable communities than can be
                     extracted from reference sources/gazetteers such as
                     Wikipedia.},
  acknowledgement = ack-nhfb,
  articleno       = {7},
  journal-URL     = {http://portal.acm.org/browse_dl.cfm?idx=J1054},
  keywords        = {artificial intelligence; community discovery; Discrete
                     mathematics; graph theory; news analysis; social
                     networks},
}
@Article{Lin:2009:ACT,
  author          = {Yu-Ru Lin and Yun Chi and Shenghuo Zhu and Hari
                     Sundaram and Belle L. Tseng},
  title           = {Analyzing communities and their evolutions in dynamic
                     social networks},
  journal         = j-TKDD,
  volume          = {3},
  number          = {2},
  pages           = {8:1--8:??},
  month           = apr,
  year            = {2009},
  CODEN           = {????},
  DOI             = {http://doi.acm.org/10.1145/1514888.1514891},
  ISSN            = {1556-4681 (print), 1556-472X (electronic)},
  bibdate         = {Fri Apr 24 18:00:12 MDT 2009},
  bibsource       = {http://www.acm.org/pubs/contents/journals/tkdd/;
                     http://www.math.utah.edu/pub/tex/bib/tkdd.bib},
  abstract        = {We discover communities from social network data and
                     analyze the community evolution. These communities are
                     inherent characteristics of human interaction in online
                     social networks, as well as paper citation networks.
                     Also, communities may evolve over time, due to changes
                     to individuals' roles and social status in the network
                     as well as changes to individuals' research interests.
                     We present an innovative algorithm that deviates from
                     the traditional two-step approach to analyze community
                     evolutions. In the traditional approach, communities
                     are first detected for each time slice, and then
                     compared to determine correspondences. We argue that
                     this approach is inappropriate in applications with
                     noisy data. In this paper, we propose {\em FacetNet\/}
                     for analyzing communities and their evolutions through
                     a robust {\em unified\/} process. This novel framework
                     will discover communities and capture their evolution
                     with temporal smoothness given by historic community
                     structures. Our approach relies on formulating the
                     problem in terms of maximum a posteriori (MAP)
                     estimation, where the community structure is estimated
                     both by the observed networked data and by the prior
                     distribution given by historic community structures.
                     Then we develop an iterative algorithm, with proven low
                     time complexity, which is guaranteed to converge to an
                     optimal solution. We perform extensive experimental
                     studies, on both synthetic datasets and real datasets,
                     to demonstrate that our method discovers meaningful
                     communities and provides additional insights not
                     directly obtainable from traditional methods.},
  acknowledgement = ack-nhfb,
  articleno       = {8},
  journal-URL     = {http://portal.acm.org/browse_dl.cfm?idx=J1054},
  keywords        = {Community; community net; evolution; evolution net;
                     nonnegative matrix factorization; soft membership},
}
@Article{Kimura:2009:BLM,
  author          = {Masahiro Kimura and Kazumi Saito and Hiroshi Motoda},
  title           = {Blocking links to minimize contamination spread in a
                     social network},
  journal         = j-TKDD,
  volume          = {3},
  number          = {2},
  pages           = {9:1--9:??},
  month           = apr,
  year            = {2009},
  CODEN           = {????},
  DOI             = {http://doi.acm.org/10.1145/1514888.1514892},
  ISSN            = {1556-4681 (print), 1556-472X (electronic)},
  bibdate         = {Fri Apr 24 18:00:12 MDT 2009},
  bibsource       = {http://www.acm.org/pubs/contents/journals/tkdd/;
                     http://www.math.utah.edu/pub/tex/bib/tkdd.bib},
  abstract        = {We address the problem of minimizing the propagation
                     of undesirable things, such as computer viruses or
                     malicious rumors, by blocking a limited number of links
                     in a network, which is converse to the influence
                     maximization problem in which the most influential
                     nodes for information diffusion is searched in a social
                     network. This minimization problem is more fundamental
                     than the problem of preventing the spread of
                     contamination by removing nodes in a network. We
                     introduce two definitions for the contamination degree
                     of a network, accordingly define two contamination
                     minimization problems, and propose methods for
                     efficiently finding good approximate solutions to these
                     problems on the basis of a naturally greedy strategy.
                     Using large social networks, we experimentally
                     demonstrate that the proposed methods outperform
                     conventional link-removal methods. We also show that
                     unlike the case of blocking a limited number of nodes,
                     the strategy of removing nodes with high out-degrees is
                     not necessarily effective for these problems.},
  acknowledgement = ack-nhfb,
  articleno       = {9},
  journal-URL     = {http://portal.acm.org/browse_dl.cfm?idx=J1054},
  keywords        = {Contamination diffusion; link analysis; social
                     networks},
}
@Article{Agichtein:2009:MIS,
  author          = {Eugene Agichtein and Yandong Liu and Jiang Bian},
  title           = {Modeling information-seeker satisfaction in community
                     question answering},
  journal         = j-TKDD,
  volume          = {3},
  number          = {2},
  pages           = {10:1--10:??},
  month           = apr,
  year            = {2009},
  CODEN           = {????},
  DOI             = {http://doi.acm.org/10.1145/1514888.1514893},
  ISSN            = {1556-4681 (print), 1556-472X (electronic)},
  bibdate         = {Fri Apr 24 18:00:12 MDT 2009},
  bibsource       = {http://www.acm.org/pubs/contents/journals/tkdd/;
                     http://www.math.utah.edu/pub/tex/bib/tkdd.bib},
  abstract        = {Question Answering Communities such as Naver, Baidu
                     Knows, and Yahoo! Answers have emerged as popular, and
                     often effective, means of information seeking on the
                     web. By posting questions for other participants to
                     answer, information seekers can obtain specific answers
                     to their questions. Users of CQA portals have already
                     contributed millions of questions, and received
                     hundreds of millions of answers from other
                     participants. However, CQA is not always effective: in
                     some cases, a user may obtain a perfect answer within
                     minutes, and in others it may require hours --- and
                     sometimes days --- until a satisfactory answer is
                     contributed. We investigate the problem of predicting
                     information seeker satisfaction in collaborative
                     question answering communities, where we attempt to
                     predict whether a question author will be satisfied
                     with the answers submitted by the community
                     participants. We present a general prediction model,
                     and develop a variety of content, structure, and
                     community-focused features for this task. Our
                     experimental results, obtained from a large-scale
                     evaluation over thousands of real questions and user
                     ratings, demonstrate the feasibility of modeling and
                     predicting asker satisfaction. We complement our
                     results with a thorough investigation of the
                     interactions and information seeking patterns in
                     question answering communities that correlate with
                     information seeker satisfaction. We also explore {\em
                     personalized\/} models of asker satisfaction, and show
                     that when sufficient interaction history exists,
                     personalization can significantly improve prediction
                     accuracy over a ``one-size-fits-all'' model. Our models
                     and predictions could be useful for a variety of
                     applications, such as user intent inference, answer
                     ranking, interface design, and query suggestion and
                     routing.},
  acknowledgement = ack-nhfb,
  articleno       = {10},
  journal-URL     = {http://portal.acm.org/browse_dl.cfm?idx=J1054},
  keywords        = {Community question answering; information seeker
                     satisfaction},
}
@Article{Torvik:2009:AND,
  author          = {Vetle I. Torvik and Neil R. Smalheiser},
  title           = {Author name disambiguation in {MEDLINE}},
  journal         = j-TKDD,
  volume          = {3},
  number          = {3},
  pages           = {11:1--11:??},
  month           = jul,
  year            = {2009},
  CODEN           = {????},
  DOI             = {http://doi.acm.org/10.1145/1552303.1552304},
  ISSN            = {1556-4681 (print), 1556-472X (electronic)},
  bibdate         = {Tue Mar 16 18:36:58 MDT 2010},
  bibsource       = {http://www.acm.org/pubs/contents/journals/tkdd/;
                     http://www.math.utah.edu/pub/tex/bib/tkdd.bib},
  abstract        = {{\em Background\/}: We recently described
                     ``Author-ity,'' a model for estimating the probability
                     that two articles in MEDLINE, sharing the same author
                     name, were written by the same individual. Features
                     include shared title words, journal name, coauthors,
                     medical subject headings, language, affiliations, and
                     author name features (middle initial, suffix, and
                     prevalence in MEDLINE). Here we test the hypothesis
                     that the Author-ity model will suffice to disambiguate
                     author names for the vast majority of articles in
                     MEDLINE. {\em Methods\/}: Enhancements include: (a)
                     incorporating first names and their variants, email
                     addresses, and correlations between specific last names
                     and affiliation words; (b) new methods of generating
                     large unbiased training sets; (c) new methods for
                     estimating the prior probability; (d) a weighted least
                     squares algorithm for correcting transitivity
                     violations; and (e) a maximum likelihood based
                     agglomerative algorithm for computing clusters of
                     articles that represent inferred author-individuals.
                     {\em Results\/}: Pairwise comparisons were computed for
                     all author names on all 15.3 million articles in
                     MEDLINE (2006 baseline), that share last name and first
                     initial, to create Author-ity 2006, a database that has
                     each name on each article assigned to one of 6.7
                     million inferred author-individual clusters. Recall is
                     estimated at $\approx 98.8\%$. Lumping (putting two
                     different individuals into the same cluster) affects
                     $\approx 0.5\%$ of clusters, whereas splitting
                     (assigning articles written by the same individual to
                     $> 1$ cluster) affects $\approx 2\%$ of articles. {\em
                     Impact\/}: The Author-ity model can be applied
                     generally to other bibliographic databases. Author name
                     disambiguation allows information retrieval and data
                     integration to become {\em person-centered}, not just
                     {\em document-centered}, setting the stage for new data
                     mining and social network tools that will facilitate
                     the analysis of scholarly publishing and collaboration
                     behavior. {\em Availability\/}: The Author-ity 2006
                     database is available for nonprofit academic research,
                     and can be freely queried via
                     http://arrowsmith.psych.uic.edu.},
  acknowledgement = ack-nhfb,
  articleno       = {11},
  journal-URL     = {http://portal.acm.org/browse_dl.cfm?idx=J1054},
  keywords        = {bibliographic databases; Name disambiguation},
}
@Article{Tu:2009:SDC,
author = "Li Tu and Yixin Chen",
title = "Stream data clustering based on grid density and
attraction",
journal = j-TKDD,
volume = "3",
number = "3",
pages = "12:1--12:??",
month = jul,
year = "2009",
CODEN = "????",
DOI = "http://doi.acm.org/10.1145/1552303.1552305",
ISSN = "1556-4681 (print), 1556-472X (electronic)",
bibdate = "Tue Mar 16 18:36:58 MDT 2010",
bibsource = "http://www.acm.org/pubs/contents/journals/tkdd/;
http://www.math.utah.edu/pub/tex/bib/tkdd.bib",
abstract = "Clustering real-time stream data is an important and
challenging problem. Existing algorithms such as
CluStream are based on the {\em k\/}-means algorithm.
These clustering algorithms have difficulties finding
clusters of arbitrary shapes and handling outliers.
Further, they require the knowledge of {\em k\/} and
user-specified time window. To address these issues,
this article proposes {\em D-Stream}, a framework for
clustering stream data using a density-based
approach.\par
Our algorithm uses an online component that maps each
input data record into a grid and an offline component
that computes the grid density and clusters the grids
based on the density. The algorithm adopts a density
decaying technique to capture the dynamic changes of a
data stream and an attraction-based mechanism to
accurately generate cluster boundaries.\par
Exploiting the intricate relationships among the decay
factor, attraction, data density, and cluster
structure, our algorithm can efficiently and
effectively generate and adjust the clusters in real
time. Further, a theoretically sound technique is
developed to detect and remove sporadic grids mapped by
outliers in order to dramatically improve the space and
time efficiency of the system. The technique makes
high-speed data stream clustering feasible without
degrading the clustering quality. The experimental
results show that our algorithm has superior quality
and efficiency, can find clusters of arbitrary shapes,
and can accurately recognize the evolving behaviors of
real-time data streams.",
acknowledgement = ack-nhfb,
articleno = "12",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J1054",
keywords = "clustering; data mining; density-based algorithms;
Stream data",
}
@Article{Zhou:2009:LST,
author = "Bin Zhou and Jian Pei",
title = "Link spam target detection using page farms",
journal = j-TKDD,
volume = "3",
number = "3",
pages = "13:1--13:??",
month = jul,
year = "2009",
CODEN = "????",
DOI = "http://doi.acm.org/10.1145/1552303.1552306",
ISSN = "1556-4681 (print), 1556-472X (electronic)",
bibdate = "Tue Mar 16 18:36:58 MDT 2010",
bibsource = "http://www.acm.org/pubs/contents/journals/tkdd/;
http://www.math.utah.edu/pub/tex/bib/tkdd.bib",
abstract = "Currently, most popular Web search engines adopt some
link-based ranking methods such as PageRank. Driven by
the huge potential benefit of improving rankings of Web
pages, many tricks have been attempted to boost page
rankings. The most common way, which is known as link
spam, is to make up some artificially designed link
structures. Detecting link spam effectively is a big
challenge. In this article, we develop novel and
effective detection methods for link spam target pages
using page farms. The essential idea is intuitive:
whether a page is the beneficiary of link spam is
reflected by how it collects its PageRank score.
Technically, how a target page collects its PageRank
score is modeled by a page farm, which consists of
pages contributing a major portion of the PageRank
score of the target page. We propose two spamicity
measures based on page farms. They can be used as an
effective measure to check whether the pages are link
spam target pages. An empirical study using a newly
available real dataset strongly suggests that our
method is effective. It outperforms the
state-of-the-art methods like SpamRank and SpamMass in
both precision and recall.",
acknowledgement = ack-nhfb,
articleno = "13",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J1054",
keywords = "Link Spam; Page Farm; PageRank",
}
@Article{Wan:2009:DBC,
author = "Li Wan and Wee Keong Ng and Xuan Hong Dang and Philip
S. Yu and Kuan Zhang",
title = "Density-based clustering of data streams at multiple
resolutions",
journal = j-TKDD,
volume = "3",
number = "3",
pages = "14:1--14:??",
month = jul,
year = "2009",
CODEN = "????",
DOI = "http://doi.acm.org/10.1145/1552303.1552307",
ISSN = "1556-4681 (print), 1556-472X (electronic)",
bibdate = "Tue Mar 16 18:36:58 MDT 2010",
bibsource = "http://www.acm.org/pubs/contents/journals/tkdd/;
http://www.math.utah.edu/pub/tex/bib/tkdd.bib",
abstract = "In data stream clustering, it is desirable to have
algorithms that are able to detect clusters of
arbitrary shape, clusters that evolve over time, and
clusters with noise. Existing stream data clustering
algorithms are generally based on an online-offline
approach: The online component captures synopsis
information from the data stream (thus, overcoming
real-time and memory constraints) and the offline
component generates clusters using the stored synopsis.
The online-offline approach affects the overall
performance of stream data clustering in various ways:
the ease of deriving synopsis from streaming data; the
complexity of data structure for storing and managing
synopsis; and the frequency at which the offline
component is used to generate clusters. In this
article, we propose an algorithm that (1) computes and
updates synopsis information in constant time; (2)
allows users to discover clusters at multiple
resolutions; (3) determines the right time for users to
generate clusters from the synopsis information; (4)
generates clusters of higher purity than existing
algorithms; and (5) determines the right threshold
function for density-based clustering based on the
fading model of stream data. To the best of our
knowledge, no existing data stream algorithm has all
of these features. Experimental results show that our
algorithm is able to detect arbitrarily shaped,
evolving clusters with high quality.",
acknowledgement = ack-nhfb,
articleno = "14",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J1054",
keywords = "Data mining algorithms; density based clustering;
evolving data streams",
}
@Article{Mannila:2009:ATS,
author = "Heikki Mannila and Dimitrios Gunopulos",
title = "{ACM TKDD} special issue {ACM SIGKDD 2007} and {ACM
SIGKDD 2008}",
journal = j-TKDD,
volume = "3",
number = "4",
pages = "15:1--15:??",
month = nov,
year = "2009",
CODEN = "????",
DOI = "http://doi.acm.org/10.1145/1631162.1631163",
ISSN = "1556-4681 (print), 1556-472X (electronic)",
bibdate = "Tue Mar 16 18:37:13 MDT 2010",
bibsource = "http://www.acm.org/pubs/contents/journals/tkdd/;
http://www.math.utah.edu/pub/tex/bib/tkdd.bib",
acknowledgement = ack-nhfb,
articleno = "15",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J1054",
}
@Article{Asur:2009:EBF,
author = "Sitaram Asur and Srinivasan Parthasarathy and Duygu
Ucar",
title = "An event-based framework for characterizing the
evolutionary behavior of interaction graphs",
journal = j-TKDD,
volume = "3",
number = "4",
pages = "16:1--16:??",
month = nov,
year = "2009",
CODEN = "????",
DOI = "http://doi.acm.org/10.1145/1631162.1631164",
ISSN = "1556-4681 (print), 1556-472X (electronic)",
bibdate = "Tue Mar 16 18:37:13 MDT 2010",
bibsource = "http://www.acm.org/pubs/contents/journals/tkdd/;
http://www.math.utah.edu/pub/tex/bib/tkdd.bib",
abstract = "Interaction graphs are ubiquitous in many fields such
as bioinformatics, sociology and physical sciences.
There have been many studies in the literature targeted
at studying and mining these graphs. However, almost
all of them have studied these graphs from a static
point of view. The study of the evolution of these
graphs over time can provide tremendous insight on the
behavior of entities, communities and the flow of
information among them. In this work, we present an
event-based characterization of critical behavioral
patterns for temporally varying interaction graphs. We
use nonoverlapping snapshots of interaction graphs and
develop a framework for capturing and identifying
interesting events from them. We use these events to
characterize complex behavioral patterns of individuals
and communities over time. We show how semantic
information can be incorporated to reason about
community-behavior events. We also demonstrate the
application of behavioral patterns for the purposes of
modeling evolution, link prediction and influence
maximization. Finally, we present a diffusion model for
evolving networks, based on our framework.",
acknowledgement = ack-nhfb,
articleno = "16",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J1054",
keywords = "diffusion of innovations; Dynamic interaction
networks; evolutionary analysis",
}
@Article{Chi:2009:ESC,
author = "Yun Chi and Xiaodan Song and Dengyong Zhou and Koji
Hino and Belle L. Tseng",
title = "On evolutionary spectral clustering",
journal = j-TKDD,
volume = "3",
number = "4",
pages = "17:1--17:??",
month = nov,
year = "2009",
CODEN = "????",
DOI = "http://doi.acm.org/10.1145/1631162.1631165",
ISSN = "1556-4681 (print), 1556-472X (electronic)",
bibdate = "Tue Mar 16 18:37:13 MDT 2010",
bibsource = "http://www.acm.org/pubs/contents/journals/tkdd/;
http://www.math.utah.edu/pub/tex/bib/tkdd.bib",
abstract = "Evolutionary clustering is an emerging research area
essential to important applications such as clustering
dynamic Web and blog contents and clustering data
streams. In evolutionary clustering, a good clustering
result should fit the current data well, while
simultaneously not deviate too dramatically from the
recent history. To fulfill this dual purpose, a measure
of {\em temporal smoothness\/} is integrated in the
overall measure of clustering quality. In this article,
we propose two frameworks that incorporate temporal
smoothness in evolutionary spectral clustering. For
both frameworks, we start with intuitions gained from
the well-known {\em k\/}-means clustering problem, and
then propose and solve corresponding cost functions for
the evolutionary spectral clustering problems. Our
solutions to the evolutionary spectral clustering
problems provide more stable and consistent clustering
results that are less sensitive to short-term noises
while at the same time are adaptive to long-term
cluster drifts. Furthermore, we demonstrate that our
methods provide the optimal solutions to the relaxed
versions of the corresponding evolutionary
{\em k\/}-means clustering problems. Performance experiments
over a number of real and synthetic data sets
illustrate our evolutionary spectral clustering methods
provide more robust clustering results that are not
sensitive to noise and can adapt to data drifts.",
acknowledgement = ack-nhfb,
articleno = "17",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J1054",
keywords = "Evolutionary spectral clustering; preserving cluster
membership; preserving cluster quality; temporal
smoothness",
}
@Article{Fujiwara:2009:FLS,
author = "Yasuhiro Fujiwara and Yasushi Sakurai and Masaru
Kitsuregawa",
title = "Fast likelihood search for hidden {Markov} models",
journal = j-TKDD,
volume = "3",
number = "4",
pages = "18:1--18:??",
month = nov,
year = "2009",
CODEN = "????",
DOI = "http://doi.acm.org/10.1145/1631162.1631166",
ISSN = "1556-4681 (print), 1556-472X (electronic)",
bibdate = "Tue Mar 16 18:37:13 MDT 2010",
bibsource = "http://www.acm.org/pubs/contents/journals/tkdd/;
http://www.math.utah.edu/pub/tex/bib/tkdd.bib",
abstract = "Hidden Markov models (HMMs) are receiving considerable
attention in various communities and many applications
that use HMMs have emerged such as mental task
classification, biological analysis, traffic
monitoring, and anomaly detection. This article has two
goals: the first goal is exact and efficient
identification of the model whose state sequence has
the highest likelihood for the given query sequence
(more precisely, no HMM that actually has a
high-probability path for the given sequence is missed
by the algorithm), and the second goal is exact and
efficient monitoring of streaming data sequences to
find the best model. We propose SPIRAL, a fast search
method for HMM datasets. SPIRAL is based on three
ideas: (1) it clusters states of models to compute
approximate likelihood, (2) it uses several
granularities and approximates likelihood values in
search processing, and (3) it focuses on just the
promising likelihood computations by pruning out
low-likelihood state sequences. Experiments verify the
effectiveness of SPIRAL and show that it is more than
490 times faster than the naive method.",
acknowledgement = ack-nhfb,
articleno = "18",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J1054",
keywords = "Hidden Markov model; likelihood; upper bound",
}
@Article{Zhang:2009:EAG,
author = "Xiang Zhang and Fei Zou and Wei Wang",
title = "Efficient algorithms for genome-wide association
study",
journal = j-TKDD,
volume = "3",
number = "4",
pages = "19:1--19:??",
month = nov,
year = "2009",
CODEN = "????",
DOI = "http://doi.acm.org/10.1145/1631162.1631167",
ISSN = "1556-4681 (print), 1556-472X (electronic)",
bibdate = "Tue Mar 16 18:37:13 MDT 2010",
bibsource = "http://www.acm.org/pubs/contents/journals/tkdd/;
http://www.math.utah.edu/pub/tex/bib/tkdd.bib",
abstract = "Studying the association between quantitative
phenotype (such as height or weight) and single
nucleotide polymorphisms (SNPs) is an important problem
in biology. To understand underlying mechanisms of
complex phenotypes, it is often necessary to consider
joint genetic effects across multiple SNPs. ANOVA
(analysis of variance) test is routinely used in
association study. Important findings from studying
gene-gene (SNP-pair) interactions are appearing in the
literature. However, the number of SNPs can be up to
millions. Evaluating joint effects of SNPs is a
challenging task even for SNP-pairs. Moreover, with
large number of SNPs correlated, permutation procedure
is preferred over simple Bonferroni correction for
properly controlling family-wise error rate and
retaining mapping power, which dramatically increases
the computational cost of association study.\par
In this article, we study the problem of finding
SNP-pairs that have significant associations with a
given quantitative phenotype. We propose an efficient
algorithm, FastANOVA, for performing ANOVA tests on
SNP-pairs in a batch mode, which also supports large
permutation test. We derive an upper bound of SNP-pair
ANOVA test, which can be expressed as the sum of two
terms. The first term is based on single-SNP ANOVA
test. The second term is based on the SNPs and
independent of any phenotype permutation. Furthermore,
SNP-pairs can be organized into groups, each of which
shares a common upper bound. This allows for maximum
reuse of intermediate computation, efficient upper
bound estimation, and effective SNP-pair pruning.
Consequently, FastANOVA only needs to perform the ANOVA
test on a small number of candidate SNP-pairs without
the risk of missing any significant ones. Extensive
experiments demonstrate that FastANOVA is orders of
magnitude faster than the brute-force implementation of
ANOVA tests on all SNP pairs. The principles used in
FastANOVA can be applied to categorical phenotypes and
other statistics such as Chi-square test.",
acknowledgement = ack-nhfb,
articleno = "19",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J1054",
keywords = "ANOVA test; Association study; permutation test",
}
@Article{Bilgic:2009:RCM,
author = "Mustafa Bilgic and Lise Getoor",
title = "Reflect and correct: {A} misclassification prediction
approach to active inference",
journal = j-TKDD,
volume = "3",
number = "4",
pages = "20:1--20:??",
month = nov,
year = "2009",
CODEN = "????",
DOI = "http://doi.acm.org/10.1145/1631162.1631168",
ISSN = "1556-4681 (print), 1556-472X (electronic)",
bibdate = "Tue Mar 16 18:37:13 MDT 2010",
bibsource = "http://www.acm.org/pubs/contents/journals/tkdd/;
http://www.math.utah.edu/pub/tex/bib/tkdd.bib",
abstract = "Information diffusion, viral marketing, graph-based
semi-supervised learning, and collective classification
all attempt to model and exploit the relationships
among nodes in a network to improve the performance of
node labeling algorithms. However, sometimes the
advantage of exploiting the relationships can become a
disadvantage. Simple models like label propagation and
iterative classification can aggravate a
misclassification by propagating mistakes in the
network, while more complex models that define and
optimize a global objective function, such as Markov
random fields and graph mincuts, can misclassify a set
of nodes jointly. This problem can be mitigated if the
classification system is allowed to ask for the correct
labels for a few of the nodes during inference.
However, determining the optimal set of labels to
acquire is intractable under relatively general
assumptions, which forces us to resort to approximate
and heuristic techniques. We describe three such
techniques in this article. The first one is based on
directly approximating the value of the objective
function of label acquisition and greedily acquiring
the label that provides the most improvement. The
second technique is a simple technique based on the
analogy we draw between viral marketing and label
acquisition. Finally, we propose a method, which we
refer to as {\em reflect and correct}, that can learn
and predict when the classification system is likely to
make mistakes and suggests acquisitions to correct
those mistakes. We empirically show on a variety of
synthetic and real-world datasets that the reflect and
correct method significantly outperforms the other two
techniques, as well as other approaches based on
network structural measures such as node degree and
network clustering.",
acknowledgement = ack-nhfb,
articleno = "20",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J1054",
keywords = "Active inference; collective classification;
information diffusion; label acquisition; viral
marketing",
}
@Article{Kiernan:2009:CCS,
author = "Jerry Kiernan and Evimaria Terzi",
title = "Constructing comprehensive summaries of large event
sequences",
journal = j-TKDD,
volume = "3",
number = "4",
pages = "21:1--21:??",
month = nov,
year = "2009",
CODEN = "????",
DOI = "http://doi.acm.org/10.1145/1631162.1631169",
ISSN = "1556-4681 (print), 1556-472X (electronic)",
bibdate = "Tue Mar 16 18:37:13 MDT 2010",
bibsource = "http://www.acm.org/pubs/contents/journals/tkdd/;
http://www.math.utah.edu/pub/tex/bib/tkdd.bib",
abstract = "Event sequences capture system and user activity over
time. Prior research on sequence mining has mostly
focused on discovering local patterns appearing in a
sequence. While interesting, these patterns do not give
a comprehensive summary of the entire event sequence.
Moreover, the number of patterns discovered can be
large. In this article, we take an alternative approach
and build {\em short\/} summaries that describe an
entire sequence, and discover local dependencies
between event types.\par
We formally define the summarization problem as an
optimization problem that balances shortness of the
summary with accuracy of the data description. We show
that this problem can be solved optimally in polynomial
time by using a combination of two dynamic-programming
algorithms. We also explore more efficient greedy
alternatives and demonstrate that they work well on
large datasets. Experiments on both synthetic and real
datasets illustrate that our algorithms are efficient
and produce high-quality results, and reveal
interesting local structures in the data.",
acknowledgement = ack-nhfb,
articleno = "21",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J1054",
keywords = "Event sequences; log mining; summarization",
}
@Article{Koren:2010:FNS,
author = "Yehuda Koren",
title = "Factor in the neighbors: {Scalable} and accurate
collaborative filtering",
journal = j-TKDD,
volume = "4",
number = "1",
pages = "1:1--1:??",
month = jan,
year = "2010",
CODEN = "????",
DOI = "http://doi.acm.org/10.1145/1644873.1644874",
ISSN = "1556-4681 (print), 1556-472X (electronic)",
bibdate = "Tue Mar 16 18:37:37 MDT 2010",
bibsource = "http://www.acm.org/pubs/contents/journals/tkdd/;
http://www.math.utah.edu/pub/tex/bib/tkdd.bib",
abstract = "Recommender systems provide users with personalized
suggestions for products or services. These systems
often rely on collaborative filtering (CF), where past
transactions are analyzed in order to establish
connections between users and products. The most common
approach to CF is based on neighborhood models, which
originate from similarities between products or users.
In this work we introduce a new neighborhood model with
an improved prediction accuracy. Unlike previous
approaches that are based on heuristic similarities, we
model neighborhood relations by minimizing a global
cost function. Further accuracy improvements are
achieved by extending the model to exploit both
explicit and implicit feedback by the users. Past
models were limited by the need to compute all pairwise
similarities between items or users, which grow
quadratically with input size. In particular, this
limitation vastly complicates adopting user similarity
models, due to the typical large number of users. Our
new model solves these limitations by factoring the
neighborhood model, thus making both item-item and
user-user implementations scale linearly with the size
of the data. The methods are tested on the Netflix
data, with encouraging results.",
acknowledgement = ack-nhfb,
articleno = "1",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J1054",
keywords = "collaborative filtering; Netflix Prize; Recommender
systems",
}
@Article{Syed:2010:MDP,
author = "Zeeshan Syed and Collin Stultz and Manolis Kellis and
Piotr Indyk and John Guttag",
title = "Motif discovery in physiological datasets: {A}
methodology for inferring predictive elements",
journal = j-TKDD,
volume = "4",
number = "1",
pages = "2:1--2:??",
month = jan,
year = "2010",
CODEN = "????",
DOI = "http://doi.acm.org/10.1145/1644873.1644875",
ISSN = "1556-4681 (print), 1556-472X (electronic)",
bibdate = "Tue Mar 16 18:37:37 MDT 2010",
bibsource = "http://www.acm.org/pubs/contents/journals/tkdd/;
http://www.math.utah.edu/pub/tex/bib/tkdd.bib",
abstract = "In this article, we propose a methodology for
identifying predictive physiological patterns in the
absence of prior knowledge. We use the principle of
conservation to identify activity that consistently
precedes an outcome in patients, and describe a
two-stage process that allows us to efficiently search
for such patterns in large datasets. This involves
first transforming continuous physiological signals
from patients into symbolic sequences, and then
searching for patterns in these reduced representations
that are strongly associated with an outcome.\par
Our strategy of identifying conserved activity that is
unlikely to have occurred purely by chance in symbolic
data is analogous to the discovery of regulatory motifs
in genomic datasets. We build upon existing work in
this area, generalizing the notion of a regulatory
motif and enhancing current techniques to operate
robustly on non-genomic data. We also address two
significant considerations associated with motif
discovery in general: computational efficiency and
robustness in the presence of degeneracy and noise. To
deal with these issues, we introduce the concept of
active regions and new subset-based techniques such as
a two-layer Gibbs sampling algorithm. These extensions
allow for a framework for information inference, where
precursors are identified as approximately conserved
activity of arbitrary complexity preceding multiple
occurrences of an event.\par
We evaluated our solution on a population of patients
who experienced sudden cardiac death and attempted to
discover electrocardiographic activity that may be
associated with the endpoint of death. To assess the
predictive patterns discovered, we compared likelihood
scores for motifs in the sudden death population
against control populations of normal individuals and
those with non-fatal supraventricular arrhythmias. Our
results suggest that predictive motif discovery may be
able to identify clinically relevant information even
in the absence of significant prior knowledge.",
acknowledgement = ack-nhfb,
articleno = "2",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J1054",
keywords = "data mining; Gibbs sampling; inference; knowledge
discovery; motifs; physiological signals",
}
@Article{Webb:2010:SSI,
author = "Geoffrey I. Webb",
title = "Self-sufficient itemsets: {An} approach to screening
potentially interesting associations between items",
journal = j-TKDD,
volume = "4",
number = "1",
pages = "3:1--3:??",
month = jan,
year = "2010",
CODEN = "????",
DOI = "http://doi.acm.org/10.1145/1644873.1644876",
ISSN = "1556-4681 (print), 1556-472X (electronic)",
bibdate = "Tue Mar 16 18:37:37 MDT 2010",
bibsource = "http://www.acm.org/pubs/contents/journals/tkdd/;
http://www.math.utah.edu/pub/tex/bib/tkdd.bib",
abstract = "Self-sufficient itemsets are those whose frequency
cannot be explained solely by the frequency of either
their subsets or of their supersets. We argue that
itemsets that are not self-sufficient will often be of
little interest to the data analyst, as their frequency
should be expected once that of the itemsets on which
their frequency depends is known. We present tests for
statistically sound discovery of self-sufficient
itemsets, and computational techniques that allow those
tests to be applied as a post-processing step for any
itemset discovery algorithm. We also present a measure
for assessing the degree of potential interest in an
itemset that complements these statistical measures.",
acknowledgement = ack-nhfb,
articleno = "3",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J1054",
keywords = "Association discovery; association rules; itemset
discovery; itemset screening; statistical evaluation",
}
@Article{Plantevit:2010:MMM,
author = "Marc Plantevit and Anne Laurent and Dominique Laurent
and Maguelonne Teisseire and Yeow Wei Choong",
title = "Mining multidimensional and multilevel sequential
patterns",
journal = j-TKDD,
volume = "4",
number = "1",
pages = "4:1--4:??",
month = jan,
year = "2010",
CODEN = "????",
DOI = "http://doi.acm.org/10.1145/1644873.1644877",
ISSN = "1556-4681 (print), 1556-472X (electronic)",
bibdate = "Tue Mar 16 18:37:37 MDT 2010",
bibsource = "http://www.acm.org/pubs/contents/journals/tkdd/;
http://www.math.utah.edu/pub/tex/bib/tkdd.bib",
abstract = "Multidimensional databases have been designed to
provide decision makers with the necessary tools to
help them understand their data. This framework is
different from transactional data as the datasets
contain huge volumes of historicized and aggregated
data defined over a set of dimensions that can be
arranged through multiple levels of granularities. Many
tools have been proposed to query the data and navigate
through the levels of granularity. However, automatic
tools are still missing to mine this type of data in
order to discover regular specific patterns. In this
article, we present a method for mining sequential
patterns from multidimensional databases, at the same
time taking advantage of the different dimensions and
levels of granularity, which is original compared to
existing work. The necessary definitions and algorithms
are extended from regular sequential patterns to this
particular case. Experiments are reported, showing the
significance of this approach.",
acknowledgement = ack-nhfb,
articleno = "4",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J1054",
keywords = "frequent patterns; hierarchy; multidimensional
databases; multilevel patterns; Sequential patterns",
}
@Article{Zaki:2010:VVO,
author = "Mohammed J. Zaki and Christopher D. Carothers and
Boleslaw K. Szymanski",
title = "{VOGUE}: {A} variable order hidden {Markov} model with
duration based on frequent sequence mining",
journal = j-TKDD,
volume = "4",
number = "1",
pages = "5:1--5:??",
month = jan,
year = "2010",
CODEN = "????",
DOI = "http://doi.acm.org/10.1145/1644873.1644878",
ISSN = "1556-4681 (print), 1556-472X (electronic)",
bibdate = "Tue Mar 16 18:37:37 MDT 2010",
bibsource = "http://www.acm.org/pubs/contents/journals/tkdd/;
http://www.math.utah.edu/pub/tex/bib/tkdd.bib",
abstract = "We present VOGUE, a novel, variable order hidden
Markov model with state durations, that combines two
separate techniques for modeling complex patterns in
sequential data: pattern mining and data modeling.
VOGUE relies on a variable gap sequence mining method
to extract frequent patterns with different lengths and
gaps between elements. It then uses these mined
sequences to build a variable order hidden Markov model
(HMM), that explicitly models the gaps. The gaps
implicitly model the order of the HMM, and they
explicitly model the duration of each state. We apply
VOGUE to a variety of real sequence data taken from
domains such as protein sequence classification, Web
usage logs, intrusion detection, and spelling
correction. We show that VOGUE has superior
classification accuracy compared to regular HMMs,
higher-order HMMs, and even special purpose HMMs like
HMMER, which is a state-of-the-art method for protein
classification. The VOGUE implementation and the
datasets used in this article are available as
open-source.",
acknowledgement = ack-nhfb,
articleno = "5",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J1054",
keywords = "Hidden Markov models; higher-order HMM; HMM with
duration; sequence mining and modeling; variable-order
HMM",
}
@Article{Vadera:2010:CCS,
author = "Sunil Vadera",
title = "{CSNL}: {A} cost-sensitive non-linear decision tree
algorithm",
journal = j-TKDD,
volume = "4",
number = "2",
pages = "6:1--6:??",
month = may,
year = "2010",
CODEN = "????",
DOI = "http://doi.acm.org/10.1145/1754428.1754429",
ISSN = "1556-4681 (print), 1556-472X (electronic)",
bibdate = "Sat Aug 14 17:12:30 MDT 2010",
bibsource = "http://www.acm.org/pubs/contents/journals/tkdd/;
http://www.math.utah.edu/pub/tex/bib/tkdd.bib",
abstract = "This article presents a new decision tree learning
algorithm called CSNL that induces Cost-Sensitive
Non-Linear decision trees. The algorithm is based on
the hypothesis that nonlinear decision nodes provide a
better basis than axis-parallel decision nodes and
utilizes discriminant analysis to construct nonlinear
decision trees that take account of costs of
misclassification.\par
The performance of the algorithm is evaluated by
applying it to seventeen datasets and the results are
compared with those obtained by two well known
cost-sensitive algorithms, ICET and MetaCost, which
generate multiple trees to obtain some of the best
results to date. The results show that CSNL performs at
least as well, if not better than these algorithms, in
more than twelve of the datasets and is considerably
faster. The use of bagging with CSNL further enhances
its performance showing the significant benefits of
using nonlinear decision nodes.",
acknowledgement = ack-nhfb,
articleno = "6",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J1054",
keywords = "cost-sensitive learning; Decision tree learning",
}
@Article{Kandylas:2010:AKC,
  author =       "Vasileios Kandylas and S. Phineas Upham and Lyle H.
                 Ungar",
  title =        "Analyzing knowledge communities using foreground and
                 background clusters",
  journal =      j-TKDD,
  volume =       "4",
  number =       "2",
  pages =        "7:1--7:??",
  month =        may,
  year =         "2010",
  CODEN =        "????",
  DOI =          "http://dx.doi.org/10.1145/1754428.1754430",
  ISSN =         "1556-4681 (print), 1556-472X (electronic)",
  bibdate =      "Sat Aug 14 17:12:30 MDT 2010",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tkdd/;
                 http://www.math.utah.edu/pub/tex/bib/tkdd.bib",
  abstract =     "Insight into the growth (or shrinkage) of ``knowledge
                 communities'' of authors that build on each other's
                 work can be gained by studying the evolution over time
                 of clusters of documents. We cluster documents based on
                 the documents they cite in common using the Streemer
                 clustering method, which finds cohesive foreground
                 clusters (the knowledge communities) embedded in a
                 diffuse background. We build predictive models with
                 features based on the citation structure, the
                 vocabulary of the papers, and the affiliations and
                 prestige of the authors and use these models to study
                 the drivers of community growth and the predictors of
                 how widely a paper will be cited. We find that
                 scientific knowledge communities tend to grow more
                 rapidly if their publications build on diverse
                 information and use narrow vocabulary and that papers
                 that lie on the periphery of a community have the
                 highest impact, while those not in any community have
                 the lowest impact.",
  acknowledgement = ack-nhfb,
  articleno =    "7",
  fjournal =     "ACM Transactions on Knowledge Discovery from Data
                 (TKDD)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1054",
  keywords =     "citation analysis; clustering; community evolution;
                 knowledge communities; Text mining",
}
@Article{Ji:2010:SSL,
  author =       "Shuiwang Ji and Lei Tang and Shipeng Yu and Jieping
                 Ye",
  title =        "A shared-subspace learning framework for multi-label
                 classification",
  journal =      j-TKDD,
  volume =       "4",
  number =       "2",
  pages =        "8:1--8:??",
  month =        may,
  year =         "2010",
  CODEN =        "????",
  DOI =          "http://dx.doi.org/10.1145/1754428.1754431",
  ISSN =         "1556-4681 (print), 1556-472X (electronic)",
  bibdate =      "Sat Aug 14 17:12:30 MDT 2010",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tkdd/;
                 http://www.math.utah.edu/pub/tex/bib/tkdd.bib",
  abstract =     "Multi-label problems arise in various domains such as
                 multi-topic document categorization, protein function
                 prediction, and automatic image annotation. One natural
                 way to deal with such problems is to construct a binary
                 classifier for each label, resulting in a set of
                 independent binary classification problems. Since
                 multiple labels share the same input space, and the
                 semantics conveyed by different labels are usually
                 correlated, it is essential to exploit the correlation
                 information contained in different labels. In this
                 paper, we consider a general framework for extracting
                 shared structures in multi-label classification. In
                 this framework, a common subspace is assumed to be
                 shared among multiple labels. We show that the optimal
                 solution to the proposed formulation can be obtained by
                 solving a generalized eigenvalue problem, though the
                 problem is nonconvex. For high-dimensional problems,
                 direct computation of the solution is expensive, and we
                 develop an efficient algorithm for this case. One
                 appealing feature of the proposed framework is that it
                 includes several well-known algorithms as special
                 cases, thus elucidating their intrinsic relationships.
                 We further show that the proposed framework can be
                 extended to the kernel-induced feature space. We have
                 conducted extensive experiments on multi-topic web page
                 categorization and automatic gene expression pattern
                 image annotation tasks, and results demonstrate the
                 effectiveness of the proposed formulation in comparison
                 with several representative algorithms.",
  acknowledgement = ack-nhfb,
  articleno =    "8",
  fjournal =     "ACM Transactions on Knowledge Discovery from Data
                 (TKDD)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1054",
  keywords =     "gene expression pattern image annotation; kernel
                 methods; least squares loss; Multi-label
                 classification; shared subspace; singular value
                 decomposition; web page categorization",
}
@Article{Ruggieri:2010:DMD,
  author =       "Salvatore Ruggieri and Dino Pedreschi and Franco
                 Turini",
  title =        "Data mining for discrimination discovery",
  journal =      j-TKDD,
  volume =       "4",
  number =       "2",
  pages =        "9:1--9:??",
  month =        may,
  year =         "2010",
  CODEN =        "????",
  DOI =          "http://dx.doi.org/10.1145/1754428.1754432",
  ISSN =         "1556-4681 (print), 1556-472X (electronic)",
  bibdate =      "Sat Aug 14 17:12:30 MDT 2010",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tkdd/;
                 http://www.math.utah.edu/pub/tex/bib/tkdd.bib",
  abstract =     "In the context of civil rights law, discrimination
                 refers to unfair or unequal treatment of people based
                 on membership to a category or a minority, without
                 regard to individual merit. Discrimination in credit,
                 mortgage, insurance, labor market, and education has
                 been investigated by researchers in economics and human
                 sciences. With the advent of automatic decision support
                 systems, such as credit scoring systems, the ease of
                 data collection opens several challenges to data
                 analysts for the fight against discrimination. In this
                 article, we introduce the problem of discovering
                 discrimination through data mining in a dataset of
                 historical decision records, taken by humans or by
                 automatic systems. We formalize the processes of direct
                 and indirect discrimination discovery by modelling
                 protected-by-law groups and contexts where
                 discrimination occurs in a classification rule based
                 syntax. Basically, classification rules extracted from
                 the dataset allow for unveiling contexts of unlawful
                 discrimination, where the degree of burden over
                 protected-by-law groups is formalized by an extension
                 of the lift measure of a classification rule. In direct
                 discrimination, the extracted rules can be directly
                 mined in search of discriminatory contexts. In indirect
                 discrimination, the mining process needs some
                 background knowledge as a further input, for example,
                 census data, that combined with the extracted rules
                 might allow for unveiling contexts of discriminatory
                 decisions. A strategy adopted for combining extracted
                 classification rules with background knowledge is
                 called an inference model. In this article, we propose
                 two inference models and provide automatic procedures
                 for their implementation. An empirical assessment of
                 our results is provided on the German credit dataset
                 and on the PKDD Discovery Challenge 1999 financial
                 dataset.",
  acknowledgement = ack-nhfb,
  articleno =    "9",
  fjournal =     "ACM Transactions on Knowledge Discovery from Data
                 (TKDD)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1054",
  keywords =     "classification rules; Discrimination",
}
@Article{Thomas:2010:MMF,
  author =       "Lini T. Thomas and Satyanarayana R. Valluri and
                 Kamalakar Karlapalem",
  title =        "{MARGIN}: {Maximal} frequent subgraph mining",
  journal =      j-TKDD,
  volume =       "4",
  number =       "3",
  pages =        "10:1--10:??",
  month =        oct,
  year =         "2010",
  CODEN =        "????",
  DOI =          "http://dx.doi.org/10.1145/1839490.1839491",
  ISSN =         "1556-4681 (print), 1556-472X (electronic)",
  bibdate =      "Mon Mar 28 11:43:57 MDT 2011",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tkdd/;
                 http://www.math.utah.edu/pub/tex/bib/tkdd.bib",
  acknowledgement = ack-nhfb,
  articleno =    "10",
  fjournal =     "ACM Transactions on Knowledge Discovery from Data
                 (TKDD)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1054",
}
@Article{Deodhar:2010:SFS,
  author =       "Meghana Deodhar and Joydeep Ghosh",
  title =        "{SCOAL}: {A} framework for simultaneous co-clustering
                 and learning from complex data",
  journal =      j-TKDD,
  volume =       "4",
  number =       "3",
  pages =        "11:1--11:??",
  month =        oct,
  year =         "2010",
  CODEN =        "????",
  DOI =          "http://dx.doi.org/10.1145/1839490.1839492",
  ISSN =         "1556-4681 (print), 1556-472X (electronic)",
  bibdate =      "Mon Mar 28 11:43:57 MDT 2011",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tkdd/;
                 http://www.math.utah.edu/pub/tex/bib/tkdd.bib",
  acknowledgement = ack-nhfb,
  articleno =    "11",
  fjournal =     "ACM Transactions on Knowledge Discovery from Data
                 (TKDD)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1054",
}
@Article{Chen:2010:BBI,
  author =       "Jinlin Chen and Keli Xiao",
  title =        "{BISC}: {A} bitmap itemset support counting approach
                 for efficient frequent itemset mining",
  journal =      j-TKDD,
  volume =       "4",
  number =       "3",
  pages =        "12:1--12:??",
  month =        oct,
  year =         "2010",
  CODEN =        "????",
  DOI =          "http://dx.doi.org/10.1145/1839490.1839493",
  ISSN =         "1556-4681 (print), 1556-472X (electronic)",
  bibdate =      "Mon Mar 28 11:43:57 MDT 2011",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tkdd/;
                 http://www.math.utah.edu/pub/tex/bib/tkdd.bib",
  acknowledgement = ack-nhfb,
  articleno =    "12",
  fjournal =     "ACM Transactions on Knowledge Discovery from Data
                 (TKDD)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1054",
}
@Article{Becchetti:2010:EAL,
  author =       "Luca Becchetti and Paolo Boldi and Carlos Castillo and
                 Aristides Gionis",
  title =        "Efficient algorithms for large-scale local triangle
                 counting",
  journal =      j-TKDD,
  volume =       "4",
  number =       "3",
  pages =        "13:1--13:??",
  month =        oct,
  year =         "2010",
  CODEN =        "????",
  DOI =          "http://dx.doi.org/10.1145/1839490.1839494",
  ISSN =         "1556-4681 (print), 1556-472X (electronic)",
  bibdate =      "Mon Mar 28 11:43:57 MDT 2011",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tkdd/;
                 http://www.math.utah.edu/pub/tex/bib/tkdd.bib",
  acknowledgement = ack-nhfb,
  articleno =    "13",
  fjournal =     "ACM Transactions on Knowledge Discovery from Data
                 (TKDD)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1054",
}
@Article{Zhang:2010:MDR,
  author =       "Yin Zhang and Zhi-Hua Zhou",
  title =        "Multilabel dimensionality reduction via dependence
                 maximization",
  journal =      j-TKDD,
  volume =       "4",
  number =       "3",
  pages =        "14:1--14:??",
  month =        oct,
  year =         "2010",
  CODEN =        "????",
  DOI =          "http://dx.doi.org/10.1145/1839490.1839495",
  ISSN =         "1556-4681 (print), 1556-472X (electronic)",
  bibdate =      "Mon Mar 28 11:43:57 MDT 2011",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tkdd/;
                 http://www.math.utah.edu/pub/tex/bib/tkdd.bib",
  acknowledgement = ack-nhfb,
  articleno =    "14",
  fjournal =     "ACM Transactions on Knowledge Discovery from Data
                 (TKDD)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1054",
}
@Article{Cui:2010:LMN,
  author =       "Ying Cui and Xiaoli Z. Fern and Jennifer G. Dy",
  title =        "Learning multiple nonredundant clusterings",
  journal =      j-TKDD,
  volume =       "4",
  number =       "3",
  pages =        "15:1--15:??",
  month =        oct,
  year =         "2010",
  CODEN =        "????",
  DOI =          "http://dx.doi.org/10.1145/1839490.1839496",
  ISSN =         "1556-4681 (print), 1556-472X (electronic)",
  bibdate =      "Mon Mar 28 11:43:57 MDT 2011",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tkdd/;
                 http://www.math.utah.edu/pub/tex/bib/tkdd.bib",
  acknowledgement = ack-nhfb,
  articleno =    "15",
  fjournal =     "ACM Transactions on Knowledge Discovery from Data
                 (TKDD)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1054",
}
@Article{Wang:2010:TSI,
  author =       "Wei Wang",
  title =        "{TKDD} Special Issue: {SIGKDD 2009}",
  journal =      j-TKDD,
  volume =       "4",
  number =       "4",
  pages =        "16:1--16:??",
  month =        oct,
  year =         "2010",
  CODEN =        "????",
  DOI =          "http://dx.doi.org/10.1145/1857947.1857948",
  ISSN =         "1556-4681 (print), 1556-472X (electronic)",
  bibdate =      "Mon Mar 28 11:43:58 MDT 2011",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tkdd/;
                 http://www.math.utah.edu/pub/tex/bib/tkdd.bib",
  acknowledgement = ack-nhfb,
  articleno =    "16",
  fjournal =     "ACM Transactions on Knowledge Discovery from Data
                 (TKDD)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1054",
}
@Article{Chen:2010:BTA,
  author =       "Ye Chen and Dmitry Pavlov and John F. Canny",
  title =        "Behavioral Targeting: The Art of Scaling Up Simple
                 Algorithms",
  journal =      j-TKDD,
  volume =       "4",
  number =       "4",
  pages =        "17:1--17:??",
  month =        oct,
  year =         "2010",
  CODEN =        "????",
  DOI =          "http://dx.doi.org/10.1145/1857947.1857949",
  ISSN =         "1556-4681 (print), 1556-472X (electronic)",
  bibdate =      "Mon Mar 28 11:43:58 MDT 2011",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tkdd/;
                 http://www.math.utah.edu/pub/tex/bib/tkdd.bib",
  acknowledgement = ack-nhfb,
  articleno =    "17",
  fjournal =     "ACM Transactions on Knowledge Discovery from Data
                 (TKDD)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1054",
}
@Article{Mohammed:2010:CDA,
  author =       "Noman Mohammed and Benjamin C. M. Fung and Patrick C.
                 K. Hung and Cheuk-Kwong Lee",
  title =        "Centralized and Distributed Anonymization for
                 High-Dimensional Healthcare Data",
  journal =      j-TKDD,
  volume =       "4",
  number =       "4",
  pages =        "18:1--18:??",
  month =        oct,
  year =         "2010",
  CODEN =        "????",
  DOI =          "http://dx.doi.org/10.1145/1857947.1857950",
  ISSN =         "1556-4681 (print), 1556-472X (electronic)",
  bibdate =      "Mon Mar 28 11:43:58 MDT 2011",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tkdd/;
                 http://www.math.utah.edu/pub/tex/bib/tkdd.bib",
  acknowledgement = ack-nhfb,
  articleno =    "18",
  fjournal =     "ACM Transactions on Knowledge Discovery from Data
                 (TKDD)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1054",
}
@Article{Liu:2010:BBM,
  author =       "Chao Liu and Fan Guo and Christos Faloutsos",
  title =        "{Bayesian} Browsing Model: Exact Inference of Document
                 Relevance from Petabyte-Scale Data",
  journal =      j-TKDD,
  volume =       "4",
  number =       "4",
  pages =        "19:1--19:??",
  month =        oct,
  year =         "2010",
  CODEN =        "????",
  DOI =          "http://dx.doi.org/10.1145/1857947.1857951",
  ISSN =         "1556-4681 (print), 1556-472X (electronic)",
  bibdate =      "Mon Mar 28 11:43:58 MDT 2011",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tkdd/;
                 http://www.math.utah.edu/pub/tex/bib/tkdd.bib",
  acknowledgement = ack-nhfb,
  articleno =    "19",
  fjournal =     "ACM Transactions on Knowledge Discovery from Data
                 (TKDD)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1054",
}
@Article{Wu:2010:MAF,
  author =       "Mingxi Wu and Chris Jermaine and Sanjay Ranka and
                 Xiuyao Song and John Gums",
  title =        "A Model-Agnostic Framework for Fast Spatial Anomaly
                 Detection",
  journal =      j-TKDD,
  volume =       "4",
  number =       "4",
  pages =        "20:1--20:??",
  month =        oct,
  year =         "2010",
  CODEN =        "????",
  DOI =          "http://dx.doi.org/10.1145/1857947.1857952",
  ISSN =         "1556-4681 (print), 1556-472X (electronic)",
  bibdate =      "Mon Mar 28 11:43:58 MDT 2011",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tkdd/;
                 http://www.math.utah.edu/pub/tex/bib/tkdd.bib",
  acknowledgement = ack-nhfb,
  articleno =    "20",
  fjournal =     "ACM Transactions on Knowledge Discovery from Data
                 (TKDD)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1054",
}
@Article{Zhong:2010:ATS,
  author =       "Ning Zhong and Gregory Piatetsky-Shapiro and Yiyu Yao
                 and Philip S. Yu",
  title =        "{ACM TKDD} Special Issue on Knowledge Discovery for
                 {Web} Intelligence",
  journal =      j-TKDD,
  volume =       "5",
  number =       "1",
  pages =        "1:1--1:??",
  month =        dec,
  year =         "2010",
  CODEN =        "????",
  DOI =          "http://dx.doi.org/10.1145/1870096.1870097",
  ISSN =         "1556-4681 (print), 1556-472X (electronic)",
  bibdate =      "Mon Mar 28 11:43:59 MDT 2011",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tkdd/;
                 http://www.math.utah.edu/pub/tex/bib/tkdd.bib",
  acknowledgement = ack-nhfb,
  articleno =    "1",
  fjournal =     "ACM Transactions on Knowledge Discovery from Data
                 (TKDD)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1054",
}
@Article{Tang:2010:CAW,
  author =       "Jie Tang and Limin Yao and Duo Zhang and Jing Zhang",
  title =        "A Combination Approach to {Web} User Profiling",
  journal =      j-TKDD,
  volume =       "5",
  number =       "1",
  pages =        "2:1--2:??",
  month =        dec,
  year =         "2010",
  CODEN =        "????",
  DOI =          "http://dx.doi.org/10.1145/1870096.1870098",
  ISSN =         "1556-4681 (print), 1556-472X (electronic)",
  bibdate =      "Mon Mar 28 11:43:59 MDT 2011",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tkdd/;
                 http://www.math.utah.edu/pub/tex/bib/tkdd.bib",
  acknowledgement = ack-nhfb,
  articleno =    "2",
  fjournal =     "ACM Transactions on Knowledge Discovery from Data
                 (TKDD)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1054",
}
@Article{Bouguessa:2010:DKS,
  author =       "Mohamed Bouguessa and Shengrui Wang and Benoit
                 Dumoulin",
  title =        "Discovering Knowledge-Sharing Communities in
                 Question-Answering Forums",
  journal =      j-TKDD,
  volume =       "5",
  number =       "1",
  pages =        "3:1--3:??",
  month =        dec,
  year =         "2010",
  CODEN =        "????",
  DOI =          "http://dx.doi.org/10.1145/1870096.1870099",
  ISSN =         "1556-4681 (print), 1556-472X (electronic)",
  bibdate =      "Mon Mar 28 11:43:59 MDT 2011",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tkdd/;
                 http://www.math.utah.edu/pub/tex/bib/tkdd.bib",
  acknowledgement = ack-nhfb,
  articleno =    "3",
  fjournal =     "ACM Transactions on Knowledge Discovery from Data
                 (TKDD)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1054",
}
@Article{Plangprasopchok:2010:MSA,
  author =       "Anon Plangprasopchok and Kristina Lerman",
  title =        "Modeling Social Annotation: {A} {Bayesian} Approach",
  journal =      j-TKDD,
  volume =       "5",
  number =       "1",
  pages =        "4:1--4:??",
  month =        dec,
  year =         "2010",
  CODEN =        "????",
  DOI =          "http://dx.doi.org/10.1145/1870096.1870100",
  ISSN =         "1556-4681 (print), 1556-472X (electronic)",
  bibdate =      "Mon Mar 28 11:43:59 MDT 2011",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tkdd/;
                 http://www.math.utah.edu/pub/tex/bib/tkdd.bib",
  acknowledgement = ack-nhfb,
  articleno =    "4",
  fjournal =     "ACM Transactions on Knowledge Discovery from Data
                 (TKDD)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1054",
}
@Article{Sakurai:2010:FDG,
  author =       "Yasushi Sakurai and Christos Faloutsos and Spiros
                 Papadimitriou",
  title =        "Fast Discovery of Group Lag Correlations in Streams",
  journal =      j-TKDD,
  volume =       "5",
  number =       "1",
  pages =        "5:1--5:??",
  month =        dec,
  year =         "2010",
  CODEN =        "????",
  DOI =          "http://dx.doi.org/10.1145/1870096.1870101",
  ISSN =         "1556-4681 (print), 1556-472X (electronic)",
  bibdate =      "Mon Mar 28 11:43:59 MDT 2011",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tkdd/;
                 http://www.math.utah.edu/pub/tex/bib/tkdd.bib",
  acknowledgement = ack-nhfb,
  articleno =    "5",
  fjournal =     "ACM Transactions on Knowledge Discovery from Data
                 (TKDD)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1054",
}
@Article{Liu:2010:FCP,
  author =       "Kun Liu and Evimaria Terzi",
  title =        "A Framework for Computing the Privacy Scores of Users
                 in Online Social Networks",
  journal =      j-TKDD,
  volume =       "5",
  number =       "1",
  pages =        "6:1--6:??",
  month =        dec,
  year =         "2010",
  CODEN =        "????",
  DOI =          "http://dx.doi.org/10.1145/1870096.1870102",
  ISSN =         "1556-4681 (print), 1556-472X (electronic)",
  bibdate =      "Mon Mar 28 11:43:59 MDT 2011",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tkdd/;
                 http://www.math.utah.edu/pub/tex/bib/tkdd.bib",
  acknowledgement = ack-nhfb,
  articleno =    "6",
  fjournal =     "ACM Transactions on Knowledge Discovery from Data
                 (TKDD)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1054",
}
@Article{Sun:2011:ISI,
  author =       "Jimeng Sun and Yan Liu and Jie Tang and Chid Apte",
  title =        "Introduction to Special Issue on Large-Scale Data
                 Mining",
  journal =      j-TKDD,
  volume =       "5",
  number =       "2",
  pages =        "7:1--7:??",
  month =        feb,
  year =         "2011",
  CODEN =        "????",
  DOI =          "http://dx.doi.org/10.1145/1921632.1921633",
  ISSN =         "1556-4681 (print), 1556-472X (electronic)",
  bibdate =      "Mon Mar 28 11:44:01 MDT 2011",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tkdd/;
                 http://www.math.utah.edu/pub/tex/bib/tkdd.bib",
  acknowledgement = ack-nhfb,
  articleno =    "7",
  fjournal =     "ACM Transactions on Knowledge Discovery from Data
                 (TKDD)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1054",
}
@Article{Kang:2011:HMR,
  author =       "U. Kang and Charalampos E. Tsourakakis and Ana Paula
                 Appel and Christos Faloutsos and Jure Leskovec",
  title =        "{HADI}: Mining Radii of Large Graphs",
  journal =      j-TKDD,
  volume =       "5",
  number =       "2",
  pages =        "8:1--8:??",
  month =        feb,
  year =         "2011",
  CODEN =        "????",
  DOI =          "http://dx.doi.org/10.1145/1921632.1921634",
  ISSN =         "1556-4681 (print), 1556-472X (electronic)",
  bibdate =      "Mon Mar 28 11:44:01 MDT 2011",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tkdd/;
                 http://www.math.utah.edu/pub/tex/bib/tkdd.bib",
  acknowledgement = ack-nhfb,
  articleno =    "8",
  fjournal =     "ACM Transactions on Knowledge Discovery from Data
                 (TKDD)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1054",
}
@Article{deVries:2011:RRL,
  author =       "Timothy de Vries and Hui Ke and Sanjay Chawla and
                 Peter Christen",
  title =        "Robust Record Linkage Blocking Using Suffix Arrays and
                 {Bloom} Filters",
  journal =      j-TKDD,
  volume =       "5",
  number =       "2",
  pages =        "9:1--9:??",
  month =        feb,
  year =         "2011",
  CODEN =        "????",
  DOI =          "http://dx.doi.org/10.1145/1921632.1921635",
  ISSN =         "1556-4681 (print), 1556-472X (electronic)",
  bibdate =      "Mon Mar 28 11:44:01 MDT 2011",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tkdd/;
                 http://www.math.utah.edu/pub/tex/bib/tkdd.bib",
  acknowledgement = ack-nhfb,
  articleno =    "9",
  fjournal =     "ACM Transactions on Knowledge Discovery from Data
                 (TKDD)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1054",
}
@Article{Dunlavy:2011:TLP,
  author =       "Daniel M. Dunlavy and Tamara G. Kolda and Evrim Acar",
  title =        "Temporal Link Prediction Using Matrix and Tensor
                 Factorizations",
  journal =      j-TKDD,
  volume =       "5",
  number =       "2",
  pages =        "10:1--10:??",
  month =        feb,
  year =         "2011",
  CODEN =        "????",
  DOI =          "http://dx.doi.org/10.1145/1921632.1921636",
  ISSN =         "1556-4681 (print), 1556-472X (electronic)",
  bibdate =      "Mon Mar 28 11:44:01 MDT 2011",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tkdd/;
                 http://www.math.utah.edu/pub/tex/bib/tkdd.bib",
  acknowledgement = ack-nhfb,
  articleno =    "10",
  fjournal =     "ACM Transactions on Knowledge Discovery from Data
                 (TKDD)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1054",
}
@Article{Magdalinos:2011:ECQ,
  author =       "Panagis Magdalinos and Christos Doulkeridis and
                 Michalis Vazirgiannis",
  title =        "Enhancing Clustering Quality through Landmark-Based
                 Dimensionality Reduction",
  journal =      j-TKDD,
  volume =       "5",
  number =       "2",
  pages =        "11:1--11:??",
  month =        feb,
  year =         "2011",
  CODEN =        "????",
  DOI =          "http://dx.doi.org/10.1145/1921632.1921637",
  ISSN =         "1556-4681 (print), 1556-472X (electronic)",
  bibdate =      "Mon Mar 28 11:44:01 MDT 2011",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tkdd/;
                 http://www.math.utah.edu/pub/tex/bib/tkdd.bib",
  acknowledgement = ack-nhfb,
  articleno =    "11",
  fjournal =     "ACM Transactions on Knowledge Discovery from Data
                 (TKDD)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1054",
}
@Article{Cheng:2011:CLA,
  author =       "Hong Cheng and Yang Zhou and Jeffrey Xu Yu",
  title =        "Clustering Large Attributed Graphs: {A} Balance
                 between Structural and Attribute Similarities",
  journal =      j-TKDD,
  volume =       "5",
  number =       "2",
  pages =        "12:1--12:??",
  month =        feb,
  year =         "2011",
  CODEN =        "????",
  DOI =          "http://dx.doi.org/10.1145/1921632.1921638",
  ISSN =         "1556-4681 (print), 1556-472X (electronic)",
  bibdate =      "Mon Mar 28 11:44:01 MDT 2011",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tkdd/;
                 http://www.math.utah.edu/pub/tex/bib/tkdd.bib",
  acknowledgement = ack-nhfb,
  articleno =    "12",
  fjournal =     "ACM Transactions on Knowledge Discovery from Data
                 (TKDD)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1054",
}
@Article{Menon:2011:FAA,
  author =       "Aditya Krishna Menon and Charles Elkan",
  title =        "Fast Algorithms for Approximating the Singular Value
                 Decomposition",
  journal =      j-TKDD,
  volume =       "5",
  number =       "2",
  pages =        "13:1--13:??",
  month =        feb,
  year =         "2011",
  CODEN =        "????",
  DOI =          "http://dx.doi.org/10.1145/1921632.1921639",
  ISSN =         "1556-4681 (print), 1556-472X (electronic)",
  bibdate =      "Mon Mar 28 11:44:01 MDT 2011",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tkdd/;
                 http://www.math.utah.edu/pub/tex/bib/tkdd.bib",
  abstract =     "A low-rank approximation to a matrix $A$ is a matrix
                 with significantly smaller rank than $A$, and which is
                 close to $A$ according to some norm. Many practical
                 applications involving the use of large matrices focus
                 on low-rank approximations. By reducing the rank or
                 dimensionality of the data, we reduce the complexity of
                 analyzing the data. The singular value decomposition is
                 the most popular low-rank matrix approximation.
                 However, due to its expensive computational
                 requirements, it has often been considered intractable
                 for practical applications involving massive data.
                 Recent developments have tried to address this problem,
                 with several methods proposed to approximate the
                 decomposition with better asymptotic runtime. We
                 present an empirical study of these techniques on a
                 variety of dense and sparse datasets. We find that a
                 sampling approach of Drineas, Kannan and Mahoney is
                 often, but not always, the best performing method. This
                 method gives solutions with high accuracy much faster
                 than classical SVD algorithms, on large sparse datasets
                 in particular. Other modern methods, such as a recent
                 algorithm by Rokhlin and Tygert, also offer savings
                 compared to classical SVD algorithms. The older
                 sampling methods of Achlioptas and McSherry are shown
                 to sometimes take longer than classical SVD.",
  acknowledgement = ack-nhfb,
  articleno =    "13",
  fjournal =     "ACM Transactions on Knowledge Discovery from Data
                 (TKDD)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1054",
}
@Article{Wang:2011:IDC,
  author =       "Dingding Wang and Shenghuo Zhu and Tao Li and Yun Chi
                 and Yihong Gong",
  title =        "Integrating Document Clustering and Multidocument
                 Summarization",
  journal =      j-TKDD,
  volume =       "5",
  number =       "3",
  pages =        "14:1--14:??",
  month =        aug,
  year =         "2011",
  CODEN =        "????",
  DOI =          "http://dx.doi.org/10.1145/1993077.1993078",
  ISSN =         "1556-4681 (print), 1556-472X (electronic)",
  bibdate =      "Thu Aug 18 13:28:08 MDT 2011",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tkdd/;
                 http://www.math.utah.edu/pub/tex/bib/tkdd.bib",
  acknowledgement = ack-nhfb,
  articleno =    "14",
  fjournal =     "ACM Transactions on Knowledge Discovery from Data
                 (TKDD)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1054",
}
@Article{Maier:2011:INS,
  author =       "Marc Maier and Matthew Rattigan and David Jensen",
  title =        "Indexing Network Structure with Shortest-Path Trees",
  journal =      j-TKDD,
  volume =       "5",
  number =       "3",
  pages =        "15:1--15:??",
  month =        aug,
  year =         "2011",
  CODEN =        "????",
  DOI =          "http://dx.doi.org/10.1145/1993077.1993079",
  ISSN =         "1556-4681 (print), 1556-472X (electronic)",
  bibdate =      "Thu Aug 18 13:28:08 MDT 2011",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tkdd/;
                 http://www.math.utah.edu/pub/tex/bib/tkdd.bib",
  acknowledgement = ack-nhfb,
  articleno =    "15",
  fjournal =     "ACM Transactions on Knowledge Discovery from Data
                 (TKDD)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1054",
}
@Article{Wong:2011:CUA,
  author =       "Raymond Chi-Wing Wong and Ada Wai-Chee Fu and Ke Wang
                 and Philip S. Yu and Jian Pei",
  title =        "Can the Utility of Anonymized Data be Used for Privacy
                 Breaches?",
  journal =      j-TKDD,
  volume =       "5",
  number =       "3",
  pages =        "16:1--16:??",
  month =        aug,
  year =         "2011",
  CODEN =        "????",
  DOI =          "http://dx.doi.org/10.1145/1993077.1993080",
  ISSN =         "1556-4681 (print), 1556-472X (electronic)",
  bibdate =      "Thu Aug 18 13:28:08 MDT 2011",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tkdd/;
                 http://www.math.utah.edu/pub/tex/bib/tkdd.bib",
  acknowledgement = ack-nhfb,
  articleno =    "16",
  fjournal =     "ACM Transactions on Knowledge Discovery from Data
                 (TKDD)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1054",
}
@Article{Lin:2011:CDM,
  author =       "Yu-Ru Lin and Jimeng Sun and Hari Sundaram and Aisling
                 Kelliher and Paul Castro and Ravi Konuru",
  title =        "Community Discovery via Metagraph Factorization",
  journal =      j-TKDD,
  volume =       "5",
  number =       "3",
  pages =        "17:1--17:??",
  month =        aug,
  year =         "2011",
  CODEN =        "????",
  DOI =          "http://dx.doi.org/10.1145/1993077.1993081",
  ISSN =         "1556-4681 (print), 1556-472X (electronic)",
  bibdate =      "Thu Aug 18 13:28:08 MDT 2011",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tkdd/;
                 http://www.math.utah.edu/pub/tex/bib/tkdd.bib",
  acknowledgement = ack-nhfb,
  articleno =    "17",
  fjournal =     "ACM Transactions on Knowledge Discovery from Data
                 (TKDD)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1054",
}
@Article{Elkan:2012:GES,
  author =       "Charles Elkan and Yehuda Koren",
  title =        "Guest Editorial for Special Issue {KDD'10}",
  journal =      j-TKDD,
  volume =       "5",
  number =       "4",
  pages =        "18:1--18:??",
  month =        feb,
  year =         "2012",
  CODEN =        "????",
  DOI =          "http://dx.doi.org/10.1145/2086737.2086738",
  ISSN =         "1556-4681 (print), 1556-472X (electronic)",
  ISSN-L =       "1556-4681",
  bibdate =      "Fri Mar 16 15:19:57 MDT 2012",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tkdd/;
                 http://www.math.utah.edu/pub/tex/bib/tkdd.bib",
  acknowledgement = ack-nhfb,
  articleno =    "18",
  fjournal =     "ACM Transactions on Knowledge Discovery from Data
                 (TKDD)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1054",
}
@Article{Iwata:2012:SMT,
  author =       "Tomoharu Iwata and Takeshi Yamada and Yasushi Sakurai
                 and Naonori Ueda",
  title =        "Sequential Modeling of Topic Dynamics with Multiple
                 Timescales",
  journal =      j-TKDD,
  volume =       "5",
  number =       "4",
  pages =        "19:1--19:??",
  month =        feb,
  year =         "2012",
  CODEN =        "????",
  DOI =          "http://dx.doi.org/10.1145/2086737.2086739",
  ISSN =         "1556-4681 (print), 1556-472X (electronic)",
  ISSN-L =       "1556-4681",
  bibdate =      "Fri Mar 16 15:19:57 MDT 2012",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tkdd/;
                 http://www.math.utah.edu/pub/tex/bib/tkdd.bib",
  abstract =     "We propose an online topic model for sequentially
                 analyzing the time evolution of topics in document
                 collections. Topics naturally evolve with multiple
                 timescales. For example, some words may be used
                 consistently over one hundred years, while other words
                 emerge and disappear over periods of a few days. Thus,
                 in the proposed model, current topic-specific
                 distributions over words are assumed to be generated
                 based on the multiscale word distributions of the
                 previous epoch. Considering both the long- and
                 short-timescale dependency yields a more robust model.
                 We derive efficient online inference procedures based
                 on a stochastic EM algorithm, in which the model is
                 sequentially updated using newly obtained data; this
                 means that past data are not required to make the
                 inference. We demonstrate the effectiveness of the
                 proposed method in terms of predictive performance and
                 computational efficiency by examining collections of
                 real documents with timestamps.",
  acknowledgement = ack-nhfb,
  articleno =    "19",
  fjournal =     "ACM Transactions on Knowledge Discovery from Data
                 (TKDD)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1054",
}
@Article{Huh:2012:DTM,
  author =       "Seungil Huh and Stephen E. Fienberg",
  title =        "Discriminative Topic Modeling Based on Manifold
                 Learning",
  journal =      j-TKDD,
  volume =       "5",
  number =       "4",
  pages =        "20:1--20:??",
  month =        feb,
  year =         "2012",
  CODEN =        "????",
  DOI =          "http://dx.doi.org/10.1145/2086737.2086740",
  ISSN =         "1556-4681 (print), 1556-472X (electronic)",
  ISSN-L =       "1556-4681",
  bibdate =      "Fri Mar 16 15:19:57 MDT 2012",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tkdd/;
                 http://www.math.utah.edu/pub/tex/bib/tkdd.bib",
  abstract =     "Topic modeling has become a popular method used for
                 data analysis in various domains including text
                 documents. Previous topic model approaches, such as
                 probabilistic Latent Semantic Analysis (pLSA) and
                 Latent Dirichlet Allocation (LDA), have shown
                 impressive success in discovering low-rank hidden
                 structures for modeling text documents. These
                 approaches, however do not take into account the
                 manifold structure of the data, which is generally
                 informative for nonlinear dimensionality reduction
                 mapping. More recent topic model approaches, Laplacian
                 PLSI (LapPLSI) and Locally-consistent Topic Model
                 (LTM), have incorporated the local manifold structure
                 into topic models and have shown resulting benefits.
                 But they fall short of achieving full discriminating
                 power of manifold learning as they only enhance the
                 proximity between the low-rank representations of
                 neighboring pairs without any consideration for
                 non-neighboring pairs. In this article, we propose a
                 new approach, Discriminative Topic Model (DTM), which
                 separates non-neighboring pairs from each other in
                 addition to bringing neighboring pairs closer together,
                 thereby preserving the global manifold structure as
                 well as improving local consistency. We also present a
                 novel model-fitting algorithm based on the generalized
                 EM algorithm and the concept of Pareto improvement. We
                 empirically demonstrate the success of DTM in terms of
                 unsupervised clustering and semisupervised
                 classification accuracies on text corpora and
                 robustness to parameters compared to state-of-the-art
                 techniques.",
  acknowledgement = ack-nhfb,
  articleno =    "20",
  fjournal =     "ACM Transactions on Knowledge Discovery from Data
                 (TKDD)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1054",
}
@Article{Gomez-Rodriguez:2012:IND,
  author =       "Manuel Gomez-Rodriguez and Jure Leskovec and Andreas
                 Krause",
  title =        "Inferring Networks of Diffusion and Influence",
  journal =      j-TKDD,
  volume =       "5",
  number =       "4",
  pages =        "21:1--21:??",
  month =        feb,
  year =         "2012",
  CODEN =        "????",
  DOI =          "http://dx.doi.org/10.1145/2086737.2086741",
  ISSN =         "1556-4681 (print), 1556-472X (electronic)",
  ISSN-L =       "1556-4681",
  bibdate =      "Fri Mar 16 15:19:57 MDT 2012",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tkdd/;
                 http://www.math.utah.edu/pub/tex/bib/tkdd.bib",
  abstract =     "Information diffusion and virus propagation are
                 fundamental processes taking place in networks. While
                 it is often possible to directly observe when nodes
                 become infected with a virus or publish the
                 information, observing individual transmissions (who
                 infects whom, or who influences whom) is typically very
                 difficult. Furthermore, in many applications, the
                 underlying network over which the diffusions and
                 propagations spread is actually unobserved. We tackle
                 these challenges by developing a method for tracing
                 paths of diffusion and influence through networks and
                 inferring the networks over which contagions propagate.
                 Given the times when nodes adopt pieces of information
                 or become infected, we identify the optimal network
                 that best explains the observed infection times. Since
                 the optimization problem is NP-hard to solve exactly,
                 we develop an efficient approximation algorithm that
                 scales to large datasets and finds provably
                 near-optimal networks. We demonstrate the effectiveness
                 of our approach by tracing information diffusion in a
                 set of 170 million blogs and news articles over a one
                 year period to infer how information flows through the
                 online media space. We find that the diffusion network
                 of news for the top 1,000 media sites and blogs tends
                 to have a core-periphery structure with a small set of
                 core media sites that diffuse information to the rest
                 of the Web. These sites tend to have stable circles of
                 influence with more general news media sites acting as
                 connectors between them.",
  acknowledgement = ack-nhfb,
  articleno =    "21",
  fjournal =     "ACM Transactions on Knowledge Discovery from Data
                 (TKDD)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1054",
}
@Article{Chen:2012:LIS,
  author =       "Jianhui Chen and Ji Liu and Jieping Ye",
  title =        "Learning Incoherent Sparse and Low-Rank Patterns from
                 Multiple Tasks",
  journal =      j-TKDD,
  volume =       "5",
  number =       "4",
  pages =        "22:1--22:??",
  month =        feb,
  year =         "2012",
  CODEN =        "????",
  DOI =          "http://dx.doi.org/10.1145/2086737.2086742",
  ISSN =         "1556-4681 (print), 1556-472X (electronic)",
  ISSN-L =       "1556-4681",
  bibdate =      "Fri Mar 16 15:19:57 MDT 2012",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tkdd/;
                 http://www.math.utah.edu/pub/tex/bib/tkdd.bib",
  abstract =     "We consider the problem of learning incoherent sparse
                 and low-rank patterns from multiple tasks. Our approach
                 is based on a linear multitask learning formulation, in
                 which the sparse and low-rank patterns are induced by a
                 cardinality regularization term and a low-rank
                 constraint, respectively. This formulation is
                 nonconvex; we convert it into its convex surrogate,
                 which can be routinely solved via semidefinite
                 programming for small-size problems. We propose
                 employing the general projected gradient scheme to
                 efficiently solve such a convex surrogate; however, in
                 the optimization formulation, the objective function is
                 nondifferentiable and the feasible domain is
                 nontrivial. We present the procedures for computing the
                 projected gradient and ensuring the global convergence
                 of the projected gradient scheme. The computation of
                 the projected gradient involves a constrained
                 optimization problem; we show that the optimal solution
                 to such a problem can be obtained via solving an
                 unconstrained optimization subproblem and a Euclidean
                 projection subproblem. We also present two projected
                 gradient algorithms and analyze their rates of
                 convergence in detail. In addition, we illustrate the
                 use of the presented projected gradient algorithms for
                 the proposed multitask learning formulation using the
                 least squares loss. Experimental results on a
                 collection of real-world data sets demonstrate the
                 effectiveness of the proposed multitask learning
                 formulation and the efficiency of the proposed
                 projected gradient algorithms.",
  acknowledgement = ack-nhfb,
  articleno =    "22",
  fjournal =     "ACM Transactions on Knowledge Discovery from Data
                 (TKDD)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1054",
}
@Article{Yu:2012:LLC,
  author =       "Hsiang-Fu Yu and Cho-Jui Hsieh and Kai-Wei Chang and
                 Chih-Jen Lin",
  title =        "Large Linear Classification When Data Cannot Fit in
                 Memory",
  journal =      j-TKDD,
  volume =       "5",
  number =       "4",
  pages =        "23:1--23:??",
  month =        feb,
  year =         "2012",
  CODEN =        "????",
  DOI =          "http://dx.doi.org/10.1145/2086737.2086743",
  ISSN =         "1556-4681 (print), 1556-472X (electronic)",
  ISSN-L =       "1556-4681",
  bibdate =      "Fri Mar 16 15:19:57 MDT 2012",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tkdd/;
                 http://www.math.utah.edu/pub/tex/bib/tkdd.bib",
  abstract =     "Recent advances in linear classification have shown
                 that for applications such as document classification,
                 the training process can be extremely efficient.
                 However, most of the existing training methods are
                 designed by assuming that data can be stored in the
                 computer memory. These methods cannot be easily applied
                 to data larger than the memory capacity due to the
                 random access to the disk. We propose and analyze a
                 block minimization framework for data larger than the
                 memory size. At each step a block of data is loaded
                 from the disk and handled by certain learning methods.
                 We investigate two implementations of the proposed
                 framework for primal and dual SVMs, respectively.
                 Because data cannot fit in memory, many design
                 considerations are very different from those for
                 traditional algorithms. We discuss and compare with
                 existing approaches that are able to handle data larger
                 than memory. Experiments using data sets 20 times
                 larger than the memory demonstrate the effectiveness of
                 the proposed method.",
  acknowledgement = ack-nhfb,
  articleno =    "23",
  fjournal =     "ACM Transactions on Knowledge Discovery from Data
                 (TKDD)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1054",
}
@Article{Shahaf:2012:CTL,
  author =       "Dafna Shahaf and Carlos Guestrin",
  title =        "Connecting Two (or Less) Dots: Discovering Structure
                 in News Articles",
  journal =      j-TKDD,
  volume =       "5",
  number =       "4",
  pages =        "24:1--24:??",
  month =        feb,
  year =         "2012",
  CODEN =        "????",
  DOI =          "http://dx.doi.org/10.1145/2086737.2086744",
  ISSN =         "1556-4681 (print), 1556-472X (electronic)",
  ISSN-L =       "1556-4681",
  bibdate =      "Fri Mar 16 15:19:57 MDT 2012",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tkdd/;
                 http://www.math.utah.edu/pub/tex/bib/tkdd.bib",
  abstract =     "Finding information is becoming a major part of our
                 daily life. Entire sectors, from Web users to
                 scientists and intelligence analysts, are increasingly
                 struggling to keep up with the larger and larger
                 amounts of content published every day. With this much
                 data, it is often easy to miss the big picture. In this
                 article, we investigate methods for automatically
                 connecting the dots---providing a structured, easy way
                 to navigate within a new topic and discover hidden
                 connections. We focus on the news domain: given two
                 news articles, our system automatically finds a
                 coherent chain linking them together. For example, it
                 can recover the chain of events starting with the
                 decline of home prices (January 2007), and ending with
                 the health care debate (2009). We formalize the
                 characteristics of a good chain and provide a fast
                 search-driven algorithm to connect two fixed endpoints.
                 We incorporate user feedback into our framework,
                 allowing the stories to be refined and personalized. We
                 also provide a method to handle partially-specified
                 endpoints, for users who do not know both ends of a
                 story. Finally, we evaluate our algorithm over real
                 news data. Our user studies demonstrate that the
                 objective we propose captures the users' intuitive
                 notion of coherence, and that our algorithm effectively
                 helps users understand the news.",
  acknowledgement = ack-nhfb,
  articleno =    "24",
  fjournal =     "ACM Transactions on Knowledge Discovery from Data
                 (TKDD)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1054",
}
@Article{Ienco:2012:CDL,
  author =       "Dino Ienco and Ruggero G. Pensa and Rosa Meo",
  title =        "From Context to Distance: Learning Dissimilarity for
                 Categorical Data Clustering",
  journal =      j-TKDD,
  volume =       "6",
  number =       "1",
  pages =        "1:1--1:??",
  month =        mar,
  year =         "2012",
  CODEN =        "????",
  DOI =          "http://dx.doi.org/10.1145/2133360.2133361",
  ISSN =         "1556-4681 (print), 1556-472X (electronic)",
  ISSN-L =       "1556-4681",
  bibdate =      "Tue Nov 6 18:30:38 MST 2012",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tkdd/;
                 http://www.math.utah.edu/pub/tex/bib/tkdd.bib",
  abstract =     "Clustering data described by categorical attributes is
                 a challenging task in data mining applications. Unlike
                 numerical attributes, it is difficult to define a
                 distance between pairs of values of a categorical
                 attribute, since the values are not ordered. In this
                 article, we propose a framework to learn a
                 context-based distance for categorical attributes. The
                 key intuition of this work is that the distance between
                 two values of a categorical attribute $A_i$ can be
                 determined by the way in which the values of the other
                 attributes $A_j$ are distributed in the dataset
                 objects: if they are similarly distributed in the
                 groups of objects in correspondence of the distinct
                 values of $A_i$ a low value of distance is obtained. We
                 propose also a solution to the critical point of the
                 choice of the attributes $A_j$. We validate our
                 approach by embedding our distance learning framework
                 in a hierarchical clustering algorithm. We applied it
                 on various real world and synthetic datasets, both low
                 and high-dimensional. Experimental results show that
                 our method is competitive with respect to the state of
                 the art of categorical data clustering approaches. We
                 also show that our approach is scalable and has a low
                 impact on the overall computational time of a
                 clustering task.",
  acknowledgement = ack-nhfb,
  articleno =    "1",
  fjournal =     "ACM Transactions on Knowledge Discovery from Data
                 (TKDD)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1054",
}
@Article{Li:2012:EMG,
  author =       "Chun Li and Qingyan Yang and Jianyong Wang and Ming
                 Li",
  title =        "Efficient Mining of Gap-Constrained Subsequences and
                 Its Various Applications",
  journal =      j-TKDD,
  volume =       "6",
  number =       "1",
  pages =        "2:1--2:??",
  month =        mar,
  year =         "2012",
  CODEN =        "????",
  DOI =          "http://dx.doi.org/10.1145/2133360.2133362",
  ISSN =         "1556-4681 (print), 1556-472X (electronic)",
  ISSN-L =       "1556-4681",
  bibdate =      "Tue Nov 6 18:30:38 MST 2012",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tkdd/;
                 http://www.math.utah.edu/pub/tex/bib/tkdd.bib",
  abstract =     "Mining frequent subsequence patterns is a typical
                 data-mining problem and various efficient sequential
                 pattern mining algorithms have been proposed. In many
                 application domains (e.g., biology), the frequent
                 subsequences confined by the predefined gap
                 requirements are more meaningful than the general
                 sequential patterns. In this article, we propose two
                 algorithms, Gap-BIDE for mining closed gap-constrained
                 subsequences from a set of input sequences, and
                 Gap-Connect for mining repetitive gap-constrained
                 subsequences from a single input sequence. Inspired by
                 some state-of-the-art closed or constrained sequential
                 pattern mining algorithms, the Gap-BIDE algorithm
                 adopts an efficient approach to finding the complete
                 set of closed sequential patterns with gap constraints,
                 while the Gap-Connect algorithm efficiently mines an
                 approximate set of long patterns by connecting short
                 patterns. We also present several methods for feature
                 selection from the set of gap-constrained patterns for
                 the purpose of classification and clustering. Our
                 extensive performance study shows that our approaches
                 are very efficient in mining frequent subsequences with
                 gap constraints, and the gap-constrained pattern based
                 classification/clustering approaches can achieve
                 high-quality results.",
  acknowledgement = ack-nhfb,
  articleno =    "2",
  fjournal =     "ACM Transactions on Knowledge Discovery from Data
                 (TKDD)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1054",
}
@Article{Liu:2012:IBA,
  author =       "Fei Tony Liu and Kai Ming Ting and Zhi-Hua Zhou",
  title =        "Isolation-Based Anomaly Detection",
  journal =      j-TKDD,
  volume =       "6",
  number =       "1",
  pages =        "3:1--3:??",
  month =        mar,
  year =         "2012",
  CODEN =        "????",
  DOI =          "http://dx.doi.org/10.1145/2133360.2133363",
  ISSN =         "1556-4681 (print), 1556-472X (electronic)",
  ISSN-L =       "1556-4681",
  bibdate =      "Tue Nov 6 18:30:38 MST 2012",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tkdd/;
                 http://www.math.utah.edu/pub/tex/bib/tkdd.bib",
  abstract =     "Anomalies are data points that are few and different.
                 As a result of these properties, we show that,
                 anomalies are susceptible to a mechanism called
                 isolation. This article proposes a method called
                 Isolation Forest ($i$Forest), which detects anomalies
                 purely based on the concept of isolation without
                 employing any distance or density
                 measure---fundamentally different from all existing
                 methods. As a result, $i$Forest is able to exploit
                 subsampling (i) to achieve a low linear time-complexity
                 and a small memory-requirement and (ii) to deal with
                 the effects of swamping and masking effectively. Our
                 empirical evaluation shows that $i$Forest outperforms
                 ORCA, one-class SVM, LOF and Random Forests in terms of
                 AUC, processing time, and it is robust against masking
                 and swamping effects. $i$Forest also works well in
                 high dimensional problems containing a large number of
                 irrelevant attributes, and when anomalies are not
                 available in training sample.",
  acknowledgement = ack-nhfb,
  articleno =    "3",
  fjournal =     "ACM Transactions on Knowledge Discovery from Data
                 (TKDD)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1054",
}
@Article{Jin:2012:MML,
  author =       "Yu Jin and Nick Duffield and Jeffrey Erman and Patrick
                 Haffner and Subhabrata Sen and Zhi-Li Zhang",
  title =        "A Modular Machine Learning System for Flow-Level
                 Traffic Classification in Large Networks",
  journal =      j-TKDD,
  volume =       "6",
  number =       "1",
  pages =        "4:1--4:??",
  month =        mar,
  year =         "2012",
  CODEN =        "????",
  DOI =          "http://dx.doi.org/10.1145/2133360.2133364",
  ISSN =         "1556-4681 (print), 1556-472X (electronic)",
  ISSN-L =       "1556-4681",
  bibdate =      "Tue Nov 6 18:30:38 MST 2012",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tkdd/;
                 http://www.math.utah.edu/pub/tex/bib/tkdd.bib",
  abstract =     "The ability to accurately and scalably classify
                 network traffic is of critical importance to a wide
                 range of management tasks of large networks, such as
                 tier-1 ISP networks and global enterprise networks.
                 Guided by the practical constraints and requirements of
                 traffic classification in large networks, in this
                 article, we explore the design of an accurate and
                 scalable machine learning based flow-level traffic
                 classification system, which is trained on a dataset of
                 flow-level data that has been annotated with
                 application protocol labels by a packet-level
                 classifier. Our system employs a lightweight modular
                 architecture, which combines a series of simple linear
                 binary classifiers, each of which can be efficiently
                 implemented and trained on vast amounts of flow data in
                 parallel, and embraces three key innovative mechanisms,
                 weighted threshold sampling, logistic calibration, and
                 intelligent data partitioning, to achieve scalability
                 while attaining high accuracy. Evaluations using real
                 traffic data from multiple locations in a large ISP
                 show that our system accurately reproduces the labels
                 of the packet level classifier when runs on (unlabeled)
                 flow records, while meeting the scalability and
                 stability requirements of large ISP networks. Using
                 training and test datasets that are two months apart
                 and collected from two different locations, the flow
                 error rates are only 3\% for TCP flows and 0.4\% for
                 UDP flows. We further show that such error rates can be
                 reduced by combining the information of spatial
                 distributions of flows, or collective traffic
                 statistics, during classification. We propose a novel
                 two-step model, which seamlessly integrates these
                 collective traffic statistics into the existing traffic
                 classification system. Experimental results display
                 performance improvement on all traffic classes and an
                 overall error rate reduction by 15\%. In addition to a
                 high accuracy, at runtime, our implementation easily
                 scales to classify traffic on 10Gbps links.",
  acknowledgement = ack-nhfb,
  articleno =    "4",
  fjournal =     "ACM Transactions on Knowledge Discovery from Data
                 (TKDD)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1054",
}
@Article{Mavroeidis:2012:SSF,
  author =       "Dimitrios Mavroeidis and Panagis Magdalinos",
  title =        "A Sequential Sampling Framework for Spectral $k$-Means
                 Based on Efficient Bootstrap Accuracy Estimations:
                 Application to Distributed Clustering",
  journal =      j-TKDD,
  volume =       "6",
  number =       "2",
  pages =        "5:1--5:??",
  month =        jul,
  year =         "2012",
  CODEN =        "????",
  DOI =          "http://dx.doi.org/10.1145/2297456.2297457",
  ISSN =         "1556-4681 (print), 1556-472X (electronic)",
  ISSN-L =       "1556-4681",
  bibdate =      "Tue Nov 6 18:30:38 MST 2012",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tkdd/;
                 http://www.math.utah.edu/pub/tex/bib/tkdd.bib",
  abstract =     "The scalability of learning algorithms has always been
                 a central concern for data mining researchers, and
                 nowadays, with the rapid increase in data storage
                 capacities and availability, its importance has
                 increased. To this end, sampling has been studied by
                 several researchers in an effort to derive sufficiently
                 accurate models using only small data fractions. In
                 this article we focus on spectral $k$-means, that is,
                 the $k$-means approximation as derived by the spectral
                 relaxation, and propose a sequential sampling framework
                 that iteratively enlarges the sample size until the
                 $k$-means results (objective function and cluster
                 structure) become indistinguishable from the asymptotic
                 (infinite-data) output. In the proposed framework we
                 adopt a commonly applied principle in data mining
                 research that considers the use of minimal assumptions
                 concerning the data generating distribution. This
                 restriction imposes several challenges, mainly related
                 to the efficiency of the sequential sampling procedure.
                 These challenges are addressed using elements of matrix
                 perturbation theory and statistics. Moreover, although
                 the main focus is on spectral $k$-means, we also
                 demonstrate that the proposed framework can be
                 generalized to handle spectral clustering. The proposed
                 sequential sampling framework is consecutively employed
                 for addressing the distributed clustering problem,
                 where the task is to construct a global model for data
                 that resides in distributed network nodes. The main
                 challenge in this context is related to the bandwidth
                 constraints that are commonly imposed, thus requiring
                 that the distributed clustering algorithm consumes a
                 minimal amount of network load. This illustrates the
                 applicability of the proposed approach, as it enables
                 the determination of a minimal sample size that can be
                 used for constructing an accurate clustering model that
                 entails the distributional characteristics of the data.
                 As opposed to the relevant distributed $k$-means
                 approaches, our framework takes into account the fact
                 that the choice of the number of clusters has a crucial
                 effect on the required amount of communication. More
                 precisely, the proposed algorithm is able to derive a
                 statistical estimation of the required relative sizes
                 for all possible values of $k$. This unique feature of
                 our distributed clustering framework enables a network
                 administrator to choose an economic solution that
                 identifies the crude cluster structure of a dataset and
                 not devote excessive network resources for identifying
                 all the ``correct'' detailed clusters.",
  acknowledgement = ack-nhfb,
  articleno =    "5",
  fjournal =     "ACM Transactions on Knowledge Discovery from Data
                 (TKDD)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1054",
}
@Article{Das:2012:MIG,
  author =       "Sanmay Das and Malik Magdon-Ismail",
  title =        "A Model for Information Growth in Collective Wisdom
                 Processes",
  journal =      j-TKDD,
  volume =       "6",
  number =       "2",
  pages =        "6:1--6:??",
  month =        jul,
  year =         "2012",
  CODEN =        "????",
  DOI =          "http://dx.doi.org/10.1145/2297456.2297458",
  ISSN =         "1556-4681 (print), 1556-472X (electronic)",
  ISSN-L =       "1556-4681",
  bibdate =      "Tue Nov 6 18:30:38 MST 2012",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tkdd/;
                 http://www.math.utah.edu/pub/tex/bib/tkdd.bib",
  abstract =     "Collaborative media such as wikis have become
                 enormously successful venues for information creation.
                 Articles accrue information through the asynchronous
                 editing of users who arrive both seeking information
                 and possibly able to contribute information. Most
                 articles stabilize to high-quality, trusted sources of
                 information representing the collective wisdom of all
                 the users who edited the article. We propose a model
                 for information growth which relies on two main
                 observations: (i) as an article's quality improves, it
                 attracts visitors at a faster rate (a rich-get-richer
                 phenomenon); and, simultaneously, (ii) the chances that
                 a new visitor will improve the article drops (there is
                 only so much that can be said about a particular
                 topic). Our model is able to reproduce many features of
                 the edit dynamics observed on Wikipedia; in particular,
                 it captures the observed rise in the edit rate,
                 followed by $1/t$ decay. Despite differences in the
                 media, we also document similar features in the comment
                 rates for a segment of the LiveJournal blogosphere.",
  acknowledgement = ack-nhfb,
  articleno =    "6",
  fjournal =     "ACM Transactions on Knowledge Discovery from Data
                 (TKDD)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1054",
}
@Article{Xu:2012:GME,
  author =       "Tianbing Xu and Zhongfei Zhang and Philip S. Yu and Bo
                 Long",
  title =        "Generative Models for Evolutionary Clustering",
  journal =      j-TKDD,
  volume =       "6",
  number =       "2",
  pages =        "7:1--7:??",
  month =        jul,
  year =         "2012",
  CODEN =        "????",
  DOI =          "http://dx.doi.org/10.1145/2297456.2297459",
  ISSN =         "1556-4681 (print), 1556-472X (electronic)",
  ISSN-L =       "1556-4681",
  bibdate =      "Tue Nov 6 18:30:38 MST 2012",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tkdd/;
                 http://www.math.utah.edu/pub/tex/bib/tkdd.bib",
  abstract =     "This article studies evolutionary clustering, a
                 recently emerged hot topic with many important
                 applications, noticeably in dynamic social network
                 analysis. In this article, based on the recent
                 literature on nonparametric Bayesian models, we have
                 developed two generative models: DPChain and HDP-HTM.
                 DPChain is derived from the Dirichlet process mixture
                 (DPM) model, with an exponential decaying component
                 along with the time. HDP-HTM combines the hierarchical
                 Dirichlet process (HDP) with a hierarchical transition
                 matrix (HTM) based on the proposed Infinite
                 hierarchical Markov state model (iHMS). Both models
                 substantially advance the literature on evolutionary
                 clustering, in the sense that not only do they both
                 perform better than those in the existing literature,
                 but more importantly, they are capable of automatically
                 learning the cluster numbers and explicitly addressing
                 the corresponding issues. Extensive evaluations have
                 demonstrated the effectiveness and the promise of these
                 two solutions compared to the state-of-the-art
                 literature.",
  acknowledgement = ack-nhfb,
  articleno =    "7",
  fjournal =     "ACM Transactions on Knowledge Discovery from Data
                 (TKDD)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1054",
}
@Article{Wang:2012:LME,
  author =       "Shaojun Wang and Dale Schuurmans and Yunxin Zhao",
  title =        "The Latent Maximum Entropy Principle",
  journal =      j-TKDD,
  volume =       "6",
  number =       "2",
  pages =        "8:1--8:??",
  month =        jul,
  year =         "2012",
  CODEN =        "????",
  DOI =          "http://dx.doi.org/10.1145/2297456.2297460",
  ISSN =         "1556-4681 (print), 1556-472X (electronic)",
  ISSN-L =       "1556-4681",
  bibdate =      "Tue Nov 6 18:30:38 MST 2012",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tkdd/;
                 http://www.math.utah.edu/pub/tex/bib/tkdd.bib",
  abstract =     "We present an extension to Jaynes' maximum entropy
                 principle that incorporates latent variables. The
                 principle of latent maximum entropy we propose is
                 different from both Jaynes' maximum entropy principle
                 and maximum likelihood estimation, but can yield better
                 estimates in the presence of hidden variables and
                 limited training data. We first show that solving for a
                 latent maximum entropy model poses a hard nonlinear
                 constrained optimization problem in general. However,
                 we then show that feasible solutions to this problem
                 can be obtained efficiently for the special case of
                 log-linear models---which forms the basis for an
                 efficient approximation to the latent maximum entropy
                 principle. We derive an algorithm that combines
                 expectation-maximization with iterative scaling to
                 produce feasible log-linear solutions. This algorithm
                 can be interpreted as an alternating minimization
                 algorithm in the information divergence, and reveals an
                 intimate connection between the latent maximum entropy
                 and maximum likelihood principles. To select a final
                 model, we generate a series of feasible candidates,
                 calculate the entropy of each, and choose the model
                 that attains the highest entropy. Our experimental
                 results show that estimation based on the latent
                 maximum entropy principle generally gives better
                 results than maximum likelihood when estimating latent
                 variable models on small observed data samples.",
  acknowledgement = ack-nhfb,
  articleno =    "8",
  fjournal =     "ACM Transactions on Knowledge Discovery from Data
                 (TKDD)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1054",
}
@Article{Bhattacharya:2012:CGC,
  author =       "Indrajit Bhattacharya and Shantanu Godbole and
                 Sachindra Joshi and Ashish Verma",
  title =        "Cross-Guided Clustering: Transfer of Relevant
                 Supervision across Tasks",
  journal =      j-TKDD,
  volume =       "6",
  number =       "2",
  pages =        "9:1--9:??",
  month =        jul,
  year =         "2012",
  CODEN =        "????",
  DOI =          "http://dx.doi.org/10.1145/2297456.2297461",
  ISSN =         "1556-4681 (print), 1556-472X (electronic)",
  ISSN-L =       "1556-4681",
  bibdate =      "Tue Nov 6 18:30:38 MST 2012",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tkdd/;
                 http://www.math.utah.edu/pub/tex/bib/tkdd.bib",
  abstract =     "Lack of supervision in clustering algorithms often
                 leads to clusters that are not useful or interesting to
                 human reviewers. We investigate if supervision can be
                 automatically transferred for clustering a target task,
                 by providing a relevant supervised partitioning of a
                 dataset from a different source task. The target
                 clustering is made more meaningful for the human user
                 by trading-off intrinsic clustering goodness on the
                 target task for alignment with relevant supervised
                 partitions in the source task, wherever possible. We
                 propose a cross-guided clustering algorithm that builds
                 on traditional $k$-means by aligning the target clusters
                 with source partitions. The alignment process makes use
                 of a cross-task similarity measure that discovers
                 hidden relationships across tasks. When the source and
                 target tasks correspond to different domains with
                 potentially different vocabularies, we propose a
                 projection approach using pivot vocabularies for the
                 cross-domain similarity measure. Using multiple
                 real-world and synthetic datasets, we show that our
                 approach improves clustering accuracy significantly
                 over traditional $k$-means and state-of-the-art
                 semi-supervised clustering baselines, over a wide range
                 of data characteristics and parameter settings.",
  acknowledgement = ack-nhfb,
  articleno =    "9",
  fjournal =     "ACM Transactions on Knowledge Discovery from Data
                 (TKDD)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1054",
}
@Article{Wang:2012:LBN,
  author =       "Zhenxing Wang and Laiwan Chan",
  title =        "Learning {Bayesian} networks from {Markov} random
                 fields: an efficient algorithm for linear models",
  journal =      j-TKDD,
  volume =       "6",
  number =       "3",
  pages =        "10:1--10:??",
  month =        oct,
  year =         "2012",
  CODEN =        "????",
  DOI =          "http://dx.doi.org/10.1145/2362383.2362384",
  ISSN =         "1556-4681 (print), 1556-472X (electronic)",
  ISSN-L =       "1556-4681",
  bibdate =      "Tue Nov 6 18:30:40 MST 2012",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tkdd/;
                 http://www.math.utah.edu/pub/tex/bib/tkdd.bib",
  abstract =     "Dependency analysis is a typical approach for Bayesian
                 network learning, which infers the structures of
                 Bayesian networks by the results of a series of
                 conditional independence (CI) tests. In practice,
                 testing independence conditioning on large sets hampers
                 the performance of dependency analysis algorithms in
                 terms of accuracy and running time for the following
                 reasons. First, testing independence on large sets of
                 variables with limited samples is not stable. Second,
                 for most dependency analysis algorithms, the number of
                 CI tests grows at an exponential rate with the sizes of
                 conditioning sets, and the running time grows of the
                 same rate. Therefore, determining how to reduce the
                 number of CI tests and the sizes of conditioning sets
                 becomes a critical step in dependency analysis
                 algorithms. In this article, we address a two-phase
                 algorithm based on the observation that the structures
                 of Markov random fields are similar to those of
                 Bayesian networks. The first phase of the algorithm
                 constructs a Markov random field from data, which
                 provides a close approximation to the structure of the
                 true Bayesian network; the second phase of the
                 algorithm removes redundant edges according to CI tests
                 to get the true Bayesian network. Both phases use
                 Markov blanket information to reduce the sizes of
                 conditioning sets and the number of CI tests without
                 sacrificing accuracy. An empirical study shows that the
                 two-phase algorithm performs well in terms of accuracy
                 and efficiency.",
  acknowledgement = ack-nhfb,
  articleno =    "10",
  fjournal =     "ACM Transactions on Knowledge Discovery from Data
                 (TKDD)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1054",
}
@Article{Chan:2012:CID,
  author =       "Jeffrey Chan and James Bailey and Christopher Leckie
                 and Michael Houle",
  title =        "{ciForager}: Incrementally discovering regions of
                 correlated change in evolving graphs",
  journal =      j-TKDD,
  volume =       "6",
  number =       "3",
  pages =        "11:1--11:??",
  month =        oct,
  year =         "2012",
  CODEN =        "????",
  DOI =          "http://dx.doi.org/10.1145/2362383.2362385",
  ISSN =         "1556-4681 (print), 1556-472X (electronic)",
  ISSN-L =       "1556-4681",
  bibdate =      "Tue Nov 6 18:30:40 MST 2012",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tkdd/;
                 http://www.math.utah.edu/pub/tex/bib/tkdd.bib",
  abstract =     "Data mining techniques for understanding how graphs
                 evolve over time have become increasingly important.
                 Evolving graphs arise naturally in diverse applications
                 such as computer network topologies, multiplayer games
                 and medical imaging. A natural and interesting problem
                 in evolving graph analysis is the discovery of compact
                 subgraphs that change in a similar manner. Such
                 subgraphs are known as regions of correlated change and
                 they can both summarise change patterns in graphs and
                 help identify the underlying events causing these
                 changes. However, previous techniques for discovering
                 regions of correlated change suffer from limited
                 scalability, making them unsuitable for analysing the
                 evolution of very large graphs. In this paper, we
                 introduce a new algorithm called ciForager, that
                 addresses this scalability challenge and offers
                 considerable improvements. The efficiency of ciForager
                 is based on the use of new incremental techniques for
                 detecting change, as well as the use of Voronoi
                 representations for efficiently determining distance.
                 We experimentally show that ciForager can achieve
                 speedups of up to 1000 times over previous approaches.
                 As a result, it becomes feasible for the first time to
                 discover regions of correlated change in extremely
                 large graphs, such as the entire BGP routing topology
                 of the Internet.",
  acknowledgement = ack-nhfb,
  articleno =    "11",
  fjournal =     "ACM Transactions on Knowledge Discovery from Data
                 (TKDD)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1054",
}
%%% NOTE(review): Wang:2013:CDS (TKDD 7(1), article 2) carries the same
%%% authors and title as this entry, with a different DOI; presumably a
%%% republication of this article --- confirm before merging the two.
@Article{Wang:2012:CDS,
  author =       "Dingding Wang and Shenghuo Zhu and Tao Li and Yihong
                 Gong",
  title =        "Comparative document summarization via discriminative
                 sentence selection",
  journal =      j-TKDD,
  volume =       "6",
  number =       "3",
  pages =        "12:1--12:??",
  month =        oct,
  year =         "2012",
  CODEN =        "????",
  DOI =          "http://dx.doi.org/10.1145/2362383.2362386",
  ISSN =         "1556-4681 (print), 1556-472X (electronic)",
  ISSN-L =       "1556-4681",
  bibdate =      "Tue Nov 6 18:30:40 MST 2012",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tkdd/;
                 http://www.math.utah.edu/pub/tex/bib/tkdd.bib",
  abstract =     "Given a collection of document groups, a natural
                 question is to identify the differences among them.
                 Although traditional document summarization techniques
                 can summarize the content of the document groups one by
                 one, there exists a great necessity to generate a
                 summary of the differences among the document groups.
                 In this article, we study a novel problem, that of
                 summarizing the differences between document groups. A
                 discriminative sentence selection method is proposed to
                 extract the most discriminative sentences which
                 represent the specific characteristics of each document
                 group. Experiments and case studies on real-world data
                 sets demonstrate the effectiveness of our proposed
                 method.",
  acknowledgement = ack-nhfb,
  articleno =    "12",
  fjournal =     "ACM Transactions on Knowledge Discovery from Data
                 (TKDD)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1054",
}
@Article{deMelo:2012:FNO,
  author =       "Pedro O. S. {Vaz de Melo} and Virgilio A. F. Almeida
                 and Antonio A. F. Loureiro and Christos Faloutsos",
  title =        "Forecasting in the {NBA} and other team sports:
                 Network effects in action",
  journal =      j-TKDD,
  volume =       "6",
  number =       "3",
  pages =        "13:1--13:??",
  month =        oct,
  year =         "2012",
  CODEN =        "????",
  DOI =          "http://dx.doi.org/10.1145/2362383.2362387",
  ISSN =         "1556-4681 (print), 1556-472X (electronic)",
  ISSN-L =       "1556-4681",
  bibdate =      "Tue Nov 6 18:30:40 MST 2012",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tkdd/;
                 http://www.math.utah.edu/pub/tex/bib/tkdd.bib",
  abstract =     "The multi-million sports-betting market is based on
                 the fact that the task of predicting the outcome of a
                 sports event is very hard. Even with the aid of an
                 uncountable number of descriptive statistics and
                 background information, only a few can correctly guess
                 the outcome of a game or a league. In this work, our
                 approach is to move away from the traditional way of
                 predicting sports events, and instead to model sports
                 leagues as networks of players and teams where the only
                 information available is the work relationships among
                 them. We propose two network-based models to predict
                 the behavior of teams in sports leagues. These models
                 are parameter-free, that is, they do not have a single
                 parameter, and moreover are sport-agnostic: they can be
                 applied directly to any team sports league. First, we
                 view a sports league as a network in evolution, and we
                 infer the implicit feedback behind network changes and
                 properties over the years. Then, we use this knowledge
                 to construct the network-based prediction models, which
                 can, with a significantly high probability, indicate
                 how well a team will perform over a season. We compare
                 our proposed models with other prediction models in two
                 of the most popular sports leagues: the National
                 Basketball Association (NBA) and the Major League
                 Baseball (MLB). Our model shows consistently good
                 results in comparison with the other models and,
                 relying upon the network properties of the teams, we
                 achieved a $\approx 14\%$ rank prediction accuracy
                 improvement over our best competitor.",
  acknowledgement = ack-nhfb,
  articleno =    "13",
  fjournal =     "ACM Transactions on Knowledge Discovery from Data
                 (TKDD)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1054",
}
@Article{Ghosh:2012:SIB,
  author =       "Joydeep Ghosh and Padhraic Smyth and Andrew Tomkins
                 and Rich Caruana",
  title =        "Special issue on best of {SIGKDD 2011}",
  journal =      j-TKDD,
  volume =       "6",
  number =       "4",
  pages =        "14:1--14:??",
  month =        dec,
  year =         "2012",
  CODEN =        "????",
  DOI =          "http://dx.doi.org/10.1145/2382577.2382578",
  ISSN =         "1556-4681 (print), 1556-472X (electronic)",
  ISSN-L =       "1556-4681",
  bibdate =      "Mon Jun 24 13:02:40 MDT 2013",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tkdd/;
                 http://www.math.utah.edu/pub/tex/bib/tkdd.bib",
  acknowledgement = ack-nhfb,
  articleno =    "14",
  fjournal =     "ACM Transactions on Knowledge Discovery from Data
                 (TKDD)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1054",
}
@Article{Kaufman:2012:LDM,
  author =       "Shachar Kaufman and Saharon Rosset and Claudia Perlich
                 and Ori Stitelman",
  title =        "Leakage in data mining: Formulation, detection, and
                 avoidance",
  journal =      j-TKDD,
  volume =       "6",
  number =       "4",
  pages =        "15:1--15:??",
  month =        dec,
  year =         "2012",
  CODEN =        "????",
  DOI =          "http://dx.doi.org/10.1145/2382577.2382579",
  ISSN =         "1556-4681 (print), 1556-472X (electronic)",
  ISSN-L =       "1556-4681",
  bibdate =      "Mon Jun 24 13:02:40 MDT 2013",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tkdd/;
                 http://www.math.utah.edu/pub/tex/bib/tkdd.bib",
  abstract =     "Deemed ``one of the top ten data mining mistakes'',
                 leakage is the introduction of information about the
                 data mining target that should not be legitimately
                 available to mine from. In addition to our own industry
                 experience with real-life projects, controversies
                 around several major public data mining competitions
                 held recently such as the INFORMS 2010 Data Mining
                 Challenge and the IJCNN 2011 Social Network Challenge
                 are evidence that this issue is as relevant today as it
                 has ever been. While acknowledging the importance and
                 prevalence of leakage in both synthetic competitions
                 and real-life data mining projects, existing literature
                 has largely left this idea unexplored. What little has
                 been said turns out not to be broad enough to cover
                 more complex cases of leakage, such as those where the
                 classical independently and identically distributed
                 (i.i.d.) assumption is violated, that have been
                 recently documented. In our new approach, these cases
                 and others are explained by explicitly defining
                 modeling goals and analyzing the broader framework of
                 the data mining problem. The resulting definition
                 enables us to derive general methodology for dealing
                 with the issue. We show that it is possible to avoid
                 leakage with a simple specific approach to data
                 management followed by what we call a learn-predict
                 separation, and present several ways of detecting
                 leakage when the modeler has no control over how the
                 data have been collected. We also offer an alternative
                 point of view on leakage that is based on causal graph
                 modeling concepts.",
  acknowledgement = ack-nhfb,
  articleno =    "15",
  fjournal =     "ACM Transactions on Knowledge Discovery from Data
                 (TKDD)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1054",
}
@Article{Mampaey:2012:SDS,
  author =       "Michael Mampaey and Jilles Vreeken and Nikolaj Tatti",
  title =        "Summarizing data succinctly with the most informative
                 itemsets",
  journal =      j-TKDD,
  volume =       "6",
  number =       "4",
  pages =        "16:1--16:??",
  month =        dec,
  year =         "2012",
  CODEN =        "????",
  DOI =          "http://dx.doi.org/10.1145/2382577.2382580",
  ISSN =         "1556-4681 (print), 1556-472X (electronic)",
  ISSN-L =       "1556-4681",
  bibdate =      "Mon Jun 24 13:02:40 MDT 2013",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tkdd/;
                 http://www.math.utah.edu/pub/tex/bib/tkdd.bib",
  abstract =     "Knowledge discovery from data is an inherently
                 iterative process. That is, what we know about the data
                 greatly determines our expectations, and therefore,
                 what results we would find interesting and/or
                 surprising. Given new knowledge about the data, our
                 expectations will change. Hence, in order to avoid
                 redundant results, knowledge discovery algorithms
                 ideally should follow such an iterative updating
                 procedure. With this in mind, we introduce a
                 well-founded approach for succinctly summarizing data
                 with the most informative itemsets; using a
                 probabilistic maximum entropy model, we iteratively
                 find the itemset that provides us the most novel
                 information---that is, for which the frequency in the
                 data surprises us the most---and in turn we update our
                 model accordingly. As we use the maximum entropy
                 principle to obtain unbiased probabilistic models, and
                 only include those itemsets that are most informative
                 with regard to the current model, the summaries we
                 construct are guaranteed to be both descriptive and
                 nonredundant. The algorithm that we present, called
                 mtv, can either discover the top-$k$ most informative
                 itemsets, or we can employ either the Bayesian
                 Information Criterion (BIC) or the Minimum Description
                 Length (MDL) principle to automatically identify the
                 set of itemsets that together summarize the data well.
                 In other words, our method will ``tell you what you
                 need to know'' about the data. Importantly, it is a
                 one-phase algorithm: rather than picking itemsets from
                 a user-provided candidate set, itemsets and their
                 supports are mined on-the-fly. To further its
                 applicability, we provide an efficient method to
                 compute the maximum entropy distribution using Quick
                 Inclusion-Exclusion. Experiments on our method, using
                 synthetic, benchmark, and real data, show that the
                 discovered summaries are succinct, and correctly
                 identify the key patterns in the data. The models they
                 form attain high likelihoods, and inspection shows that
                 they summarize the data well with increasingly
                 specific, yet nonredundant itemsets.",
  acknowledgement = ack-nhfb,
  articleno =    "16",
  fjournal =     "ACM Transactions on Knowledge Discovery from Data
                 (TKDD)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1054",
}
@Article{Chu:2012:TLM,
  author =       "Shumo Chu and James Cheng",
  title =        "Triangle listing in massive networks",
  journal =      j-TKDD,
  volume =       "6",
  number =       "4",
  pages =        "17:1--17:??",
  month =        dec,
  year =         "2012",
  CODEN =        "????",
  DOI =          "http://dx.doi.org/10.1145/2382577.2382581",
  ISSN =         "1556-4681 (print), 1556-472X (electronic)",
  ISSN-L =       "1556-4681",
  bibdate =      "Mon Jun 24 13:02:40 MDT 2013",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tkdd/;
                 http://www.math.utah.edu/pub/tex/bib/tkdd.bib",
  abstract =     "Triangle listing is one of the fundamental algorithmic
                 problems whose solution has numerous applications
                 especially in the analysis of complex networks, such as
                 the computation of clustering coefficients,
                 transitivity, triangular connectivity, trusses, etc.
                 Existing algorithms for triangle listing are mainly
                 in-memory algorithms, whose performance cannot scale
                 with the massive volume of today's fast growing
                 networks. When the input graph cannot fit in main
                 memory, triangle listing requires random disk accesses
                 that can incur prohibitively huge I/O cost. Some
                 streaming, semistreaming, and sampling algorithms have
                 been proposed but these are approximation algorithms.
                 We propose an I/O-efficient algorithm for triangle
                 listing. Our algorithm is exact and avoids random disk
                 access. Our results show that our algorithm is scalable
                 and outperforms the state-of-the-art in-memory and
                 local triangle estimation algorithms.",
  acknowledgement = ack-nhfb,
  articleno =    "17",
  fjournal =     "ACM Transactions on Knowledge Discovery from Data
                 (TKDD)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1054",
}
@Article{Chattopadhyay:2012:MDA,
  author =       "Rita Chattopadhyay and Qian Sun and Wei Fan and Ian
                 Davidson and Sethuraman Panchanathan and Jieping Ye",
  title =        "Multisource domain adaptation and its application to
                 early detection of fatigue",
  journal =      j-TKDD,
  volume =       "6",
  number =       "4",
  pages =        "18:1--18:??",
  month =        dec,
  year =         "2012",
  CODEN =        "????",
  DOI =          "http://dx.doi.org/10.1145/2382577.2382582",
  ISSN =         "1556-4681 (print), 1556-472X (electronic)",
  ISSN-L =       "1556-4681",
  bibdate =      "Mon Jun 24 13:02:40 MDT 2013",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tkdd/;
                 http://www.math.utah.edu/pub/tex/bib/tkdd.bib",
  abstract =     "We consider the characterization of muscle fatigue
                 through a noninvasive sensing mechanism such as Surface
                 ElectroMyoGraphy (SEMG). While changes in the
                 properties of SEMG signals with respect to muscle
                 fatigue have been reported in the literature, the large
                 variation in these signals across different individuals
                 makes the task of modeling and classification of SEMG
                 signals challenging. Indeed, the variation in SEMG
                 parameters from subject to subject creates differences
                 in the data distribution. In this article, we propose
                 two transfer learning frameworks based on the
                 multisource domain adaptation methodology for detecting
                 different stages of fatigue using SEMG signals, that
                 addresses the distribution differences. In the proposed
                 frameworks, the SEMG data of a subject represent a
                 domain; data from multiple subjects in the training set
                 form the multiple source domains and the test subject
                 data form the target domain. SEMG signals are
                 predominantly different in conditional probability
                 distribution across subjects. The key feature of the
                 first framework is a novel weighting scheme that
                 addresses the conditional probability distribution
                 differences across multiple domains (subjects) and the
                 key feature of the second framework is a two-stage
                 domain adaptation methodology which combines weighted
                 data from multiple sources based on marginal
                 probability differences (first stage) as well as
                 conditional probability differences (second stage),
                 with the target domain data. The weights for minimizing
                 the marginal probability differences are estimated
                 independently, while the weights for minimizing
                 conditional probability differences are computed
                 simultaneously by exploiting the potential interaction
                 among multiple sources. We also provide a theoretical
                 analysis on the generalization performance of the
                 proposed multisource domain adaptation formulation
                 using the weighted Rademacher complexity measure. We
                 have validated the proposed frameworks on Surface
                 ElectroMyoGram signals collected from 8 people during a
                 fatigue-causing repetitive gripping activity.
                 Comprehensive experiments on the SEMG dataset
                 demonstrate that the proposed method improves the
                 classification accuracy by 20\% to 30\% over the cases
                 without any domain adaptation method and by 13\% to
                 30\% over existing state-of-the-art domain adaptation
                 methods.",
  acknowledgement = ack-nhfb,
  articleno =    "18",
  fjournal =     "ACM Transactions on Knowledge Discovery from Data
                 (TKDD)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1054",
}
@Article{Wilkinson:2012:SIS,
  author =       "Leland Wilkinson and Anushka Anand and Tuan Nhon
                 Dang",
  title =        "Substantial improvements in the set-covering
                 projection classifier {CHIRP} (composite hypercubes on
                 iterated random projections)",
  journal =      j-TKDD,
  volume =       "6",
  number =       "4",
  pages =        "19:1--19:??",
  month =        dec,
  year =         "2012",
  CODEN =        "????",
  DOI =          "http://dx.doi.org/10.1145/2382577.2382583",
  ISSN =         "1556-4681 (print), 1556-472X (electronic)",
  ISSN-L =       "1556-4681",
  bibdate =      "Mon Jun 24 13:02:40 MDT 2013",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tkdd/;
                 http://www.math.utah.edu/pub/tex/bib/tkdd.bib",
  abstract =     "In Wilkinson et al. [2011] we introduced a new
                 set-covering random projection classifier that achieved
                 average error lower than that of other classifiers in
                 the Weka platform. This classifier was based on an
                 $L^\infty$ norm distance function and exploited an
                 iterative sequence of three stages (projecting,
                 binning, and covering) to deal with the curse of
                 dimensionality, computational complexity, and nonlinear
                 separability. We now present substantial changes that
                 improve robustness and reduce training and testing time
                 by almost an order of magnitude without jeopardizing
                 CHIRP's outstanding error performance.",
  acknowledgement = ack-nhfb,
  articleno =    "19",
  fjournal =     "ACM Transactions on Knowledge Discovery from Data
                 (TKDD)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1054",
}
@Article{Angiulli:2013:NNB,
  author =       "Fabrizio Angiulli and Fabio Fassetti",
  title =        "Nearest Neighbor-Based Classification of Uncertain
                 Data",
  journal =      j-TKDD,
  volume =       "7",
  number =       "1",
  pages =        "1:1--1:??",
  month =        mar,
  year =         "2013",
  CODEN =        "????",
  DOI =          "http://dx.doi.org/10.1145/2435209.2435210",
  ISSN =         "1556-4681 (print), 1556-472X (electronic)",
  ISSN-L =       "1556-4681",
  bibdate =      "Mon Jun 24 13:02:44 MDT 2013",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tkdd/;
                 http://www.math.utah.edu/pub/tex/bib/tkdd.bib",
  abstract =     "This work deals with the problem of classifying
                 uncertain data. With this aim we introduce the
                 Uncertain Nearest Neighbor (UNN) rule, which represents
                 the generalization of the deterministic nearest
                 neighbor rule to the case in which uncertain objects
                 are available. The UNN rule relies on the concept of
                 nearest neighbor class, rather than on that of nearest
                 neighbor object. The nearest neighbor class of a test
                 object is the class that maximizes the probability of
                 providing its nearest neighbor. The evidence is that
                 the former concept is much more powerful than the
                 latter in the presence of uncertainty, in that it
                 correctly models the right semantics of the nearest
                 neighbor decision rule when applied to the uncertain
                 scenario. An effective and efficient algorithm to
                 perform uncertain nearest neighbor classification of a
                 generic (un)certain test object is designed, based on
                 properties that greatly reduce the temporal cost
                 associated with nearest neighbor class probability
                 computation. Experimental results are presented,
                 showing that the UNN rule is effective and efficient in
                 classifying uncertain data.",
  acknowledgement = ack-nhfb,
  articleno =    "1",
  fjournal =     "ACM Transactions on Knowledge Discovery from Data
                 (TKDD)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1054",
}
%%% NOTE(review): Wang:2012:CDS (TKDD 6(3), article 12) carries the same
%%% authors and title as this entry, with a different DOI; presumably this
%%% is a republication of that article --- confirm before merging the two.
@Article{Wang:2013:CDS,
  author =       "Dingding Wang and Shenghuo Zhu and Tao Li and Yihong
                 Gong",
  title =        "Comparative Document Summarization via Discriminative
                 Sentence Selection",
  journal =      j-TKDD,
  volume =       "7",
  number =       "1",
  pages =        "2:1--2:??",
  month =        mar,
  year =         "2013",
  CODEN =        "????",
  DOI =          "http://dx.doi.org/10.1145/2435209.2435211",
  ISSN =         "1556-4681 (print), 1556-472X (electronic)",
  ISSN-L =       "1556-4681",
  bibdate =      "Mon Jun 24 13:02:44 MDT 2013",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tkdd/;
                 http://www.math.utah.edu/pub/tex/bib/tkdd.bib",
  abstract =     "Given a collection of document groups, a natural
                 question is to identify the differences among these
                 groups. Although traditional document summarization
                 techniques can summarize the content of the document
                 groups one by one, there exists a great necessity to
                 generate a summary of the differences among the
                 document groups. In this article, we study a novel
                 problem of summarizing the differences between document
                 groups. A discriminative sentence selection method is
                 proposed to extract the most discriminative sentences
                 that represent the specific characteristics of each
                 document group. Experiments and case studies on
                 real-world data sets demonstrate the effectiveness of
                 our proposed method.",
  acknowledgement = ack-nhfb,
  articleno =    "2",
  fjournal =     "ACM Transactions on Knowledge Discovery from Data
                 (TKDD)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1054",
}
@Article{Bayati:2013:MPA,
  author =       "Mohsen Bayati and David F. Gleich and Amin Saberi and
                 Ying Wang",
  title =        "Message-Passing Algorithms for Sparse Network
                 Alignment",
  journal =      j-TKDD,
  volume =       "7",
  number =       "1",
  pages =        "3:1--3:??",
  month =        mar,
  year =         "2013",
  CODEN =        "????",
  DOI =          "http://dx.doi.org/10.1145/2435209.2435212",
  ISSN =         "1556-4681 (print), 1556-472X (electronic)",
  ISSN-L =       "1556-4681",
  bibdate =      "Mon Jun 24 13:02:44 MDT 2013",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tkdd/;
                 http://www.math.utah.edu/pub/tex/bib/tkdd.bib",
  abstract =     "Network alignment generalizes and unifies several
                 approaches for forming a matching or alignment between
                 the vertices of two graphs. We study a mathematical
                 programming framework for network alignment problem and
                 a sparse variation of it where only a small number of
                 matches between the vertices of the two graphs are
                 possible. We propose a new message passing algorithm
                 that allows us to compute, very efficiently,
                 approximate solutions to the sparse network alignment
                 problems with graph sizes as large as hundreds of
                 thousands of vertices. We also provide extensive
                 simulations comparing our algorithms with two of the
                 best solvers for network alignment problems on two
                 synthetic matching problems, two bioinformatics
                 problems, and three large ontology alignment problems
                 including a multilingual problem with a known labeled
                 alignment.",
  acknowledgement = ack-nhfb,
  articleno =    "3",
  fjournal =     "ACM Transactions on Knowledge Discovery from Data
                 (TKDD)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1054",
}
@Article{Li:2013:CWM,
  author =       "Bin Li and Steven C. H. Hoi and Peilin Zhao and
                 Vivekanand Gopalkrishnan",
  title =        "Confidence Weighted Mean Reversion Strategy for Online
                 Portfolio Selection",
  journal =      j-TKDD,
  volume =       "7",
  number =       "1",
  pages =        "4:1--4:??",
  month =        mar,
  year =         "2013",
  CODEN =        "????",
  DOI =          "http://dx.doi.org/10.1145/2435209.2435213",
  ISSN =         "1556-4681 (print), 1556-472X (electronic)",
  ISSN-L =       "1556-4681",
  bibdate =      "Mon Jun 24 13:02:44 MDT 2013",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tkdd/;
                 http://www.math.utah.edu/pub/tex/bib/tkdd.bib",
  abstract =     "Online portfolio selection has been attracting
                 increasing attention from the data mining and machine
                 learning communities. All existing online portfolio
                 selection strategies focus on the first order
                 information of a portfolio vector, though the second
                 order information may also be beneficial to a strategy.
                 Moreover, empirical evidence shows that relative stock
                 prices may follow the mean reversion property, which
                 has not been fully exploited by existing strategies.
                 This article proposes a novel online portfolio
                 selection strategy named Confidence Weighted Mean
                 Reversion (CWMR). Inspired by the mean reversion
                 principle in finance and confidence weighted online
                 learning technique in machine learning, CWMR models the
                 portfolio vector as a Gaussian distribution, and
                 sequentially updates the distribution by following the
                 mean reversion trading principle. CWMR's closed-form
                 updates clearly reflect the mean reversion trading
                 idea. We also present several variants of CWMR
                 algorithms, including a CWMR mixture algorithm that is
                 theoretical universal. Empirically, CWMR strategy is
                 able to effectively exploit the power of mean reversion
                 for online portfolio selection. Extensive experiments
                 on various real markets show that the proposed strategy
                 is superior to the state-of-the-art techniques. The
                 experimental testbed including source codes and data
                 sets is available online.",
  acknowledgement = ack-nhfb,
  articleno =    "4",
  fjournal =     "ACM Transactions on Knowledge Discovery from Data
                 (TKDD)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1054",
}
@Article{Lou:2013:LPR,
  author =       "Tiancheng Lou and Jie Tang and John Hopcroft and
                 Zhanpeng Fang and Xiaowen Ding",
  title =        "Learning to predict reciprocity and triadic closure in
                 social networks",
  journal =      j-TKDD,
  volume =       "7",
  number =       "2",
  pages =        "5:1--5:??",
  month =        jul,
  year =         "2013",
  CODEN =        "????",
  DOI =          "http://dx.doi.org/10.1145/2499907.2499908",
  ISSN =         "1556-4681 (print), 1556-472X (electronic)",
  ISSN-L =       "1556-4681",
  bibdate =      "Thu Mar 13 09:16:06 MDT 2014",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tkdd/;
                 http://www.math.utah.edu/pub/tex/bib/tkdd.bib",
  abstract =     "We study how links are formed in social networks. In
                 particular, we focus on investigating how a reciprocal
                 (two-way) link, the basic relationship in social
                 networks, is developed from a parasocial (one-way)
                 relationship and how the relationships further develop
                 into triadic closure, one of the fundamental processes
                 of link formation. We first investigate how geographic
                 distance and interactions between users influence the
                 formation of link structure among users. Then we study
                 how social theories including homophily, social
                 balance, and social status are satisfied over networks
                 with parasocial and reciprocal relationships. The study
                 unveils several interesting phenomena. For example,
                 ``friend's friend is a friend'' indeed exists in the
                 reciprocal relationship network, but does not hold in
                 the parasocial relationship network. We propose a
                 learning framework to formulate the problems of
                 predicting reciprocity and triadic closure into a
                 graphical model. We demonstrate that it is possible to
                 accurately infer 90\% of reciprocal relationships in a
                 Twitter network. The proposed model also achieves
                 better performance (+20--30\% in terms of F1-measure)
                 than several alternative methods for predicting the
                 triadic closure formation.",
  acknowledgement = ack-nhfb,
  articleno =    "5",
  fjournal =     "ACM Transactions on Knowledge Discovery from Data
                 (TKDD)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1054",
}
@Article{Yang:2013:EOL,
  author =       "Haiqin Yang and Michael R. Lyu and Irwin King",
  title =        "Efficient online learning for multitask feature
                 selection",
  journal =      j-TKDD,
  volume =       "7",
  number =       "2",
  pages =        "6:1--6:??",
  month =        jul,
  year =         "2013",
  CODEN =        "????",
  DOI =          "http://dx.doi.org/10.1145/2499907.2499909",
  ISSN =         "1556-4681 (print), 1556-472X (electronic)",
  ISSN-L =       "1556-4681",
  bibdate =      "Thu Mar 13 09:16:06 MDT 2014",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tkdd/;
                 http://www.math.utah.edu/pub/tex/bib/tkdd.bib",
  abstract =     "Learning explanatory features across multiple related
                 tasks, or MultiTask Feature Selection (MTFS), is an
                 important problem in the applications of data mining,
                 machine learning, and bioinformatics. Previous MTFS
                 methods fulfill this task by batch-mode training. This
                 makes them inefficient when data come sequentially or
                 when the number of training data is so large that they
                 cannot be loaded into the memory simultaneously. In
                 order to tackle these problems, we propose a novel
                 online learning framework to solve the MTFS problem. A
                 main advantage of the online algorithm is its
                 efficiency in both time complexity and memory cost. The
                 weights of the MTFS models at each iteration can be
                 updated by closed-form solutions based on the average
                 of previous subgradients. This yields the worst-case
                 bounds of the time complexity and memory cost at each
                 iteration, both in the order of $O(d \times Q)$,
                 where $d$ is the number of feature dimensions and $Q$
                 is the number of tasks. Moreover, we provide theoretical
                 analysis for the average regret of the online learning
                 algorithms, which also guarantees the convergence rate
                 of the algorithms. Finally, we conduct detailed
                 experiments to show the characteristics and merits of
                 the online learning algorithms in solving several MTFS
                 problems.",
  acknowledgement = ack-nhfb,
  articleno =    "6",
  fjournal =     "ACM Transactions on Knowledge Discovery from Data
                 (TKDD)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1054",
}
@Article{Zhang:2013:MRL,
  author =       "Yu Zhang and Dit-Yan Yeung",
  title =        "Multilabel relationship learning",
  journal =      j-TKDD,
  volume =       "7",
  number =       "2",
  pages =        "7:1--7:??",
  month =        jul,
  year =         "2013",
  CODEN =        "????",
  DOI =          "http://dx.doi.org/10.1145/2499907.2499910",
  ISSN =         "1556-4681 (print), 1556-472X (electronic)",
  ISSN-L =       "1556-4681",
  bibdate =      "Thu Mar 13 09:16:06 MDT 2014",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tkdd/;
                 http://www.math.utah.edu/pub/tex/bib/tkdd.bib",
  abstract =     "Multilabel learning problems are commonly found in
                 many applications. A characteristic shared by many
                 multilabel learning problems is that some labels have
                 significant correlations between them. In this article,
                 we propose a novel multilabel learning method, called
                 MultiLabel Relationship Learning (MLRL), which extends
                 the conventional support vector machine by explicitly
                 learning and utilizing the relationships between
                 labels. Specifically, we model the label relationships
                 using a label covariance matrix and use it to define a
                 new regularization term for the optimization problem.
                 MLRL learns the model parameters and the label
                 covariance matrix simultaneously based on a unified
                 convex formulation. To solve the convex optimization
                 problem, we use an alternating method in which each
                 subproblem can be solved efficiently. The relationship
                 between MLRL and two widely used maximum margin methods
                 for multilabel learning is investigated. Moreover, we
                 also propose a semisupervised extension of MLRL, called
                 SSMLRL, to demonstrate how to make use of unlabeled
                 data to help learn the label covariance matrix. Through
                 experiments conducted on some multilabel applications,
                 we find that MLRL not only gives higher classification
                 accuracy but also has better interpretability as
                 revealed by the label covariance matrix.",
  acknowledgement = ack-nhfb,
  articleno =    "7",
  fjournal =     "ACM Transactions on Knowledge Discovery from Data
                 (TKDD)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1054",
}
@Article{Peng:2013:EFF,
  author =       "Jing Peng and Guna Seetharaman and Wei Fan and Aparna
                 Varde",
  title =        "Exploiting {Fisher} and {Fukunaga--Koontz} transforms
                 in {Chernoff} dimensionality reduction",
  journal =      j-TKDD,
  volume =       "7",
  number =       "2",
  pages =        "8:1--8:??",
  month =        jul,
  year =         "2013",
  CODEN =        "????",
  DOI =          "http://dx.doi.org/10.1145/2499907.2499911",
  ISSN =         "1556-4681 (print), 1556-472X (electronic)",
  ISSN-L =       "1556-4681",
  bibdate =      "Thu Mar 13 09:16:06 MDT 2014",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tkdd/;
                 http://www.math.utah.edu/pub/tex/bib/tkdd.bib",
  abstract =     "Knowledge discovery from big data demands effective
                 representation of data. However, big data are often
                 characterized by high dimensionality, which makes
                 knowledge discovery more difficult. Many techniques for
                 dimensionality reduction have been proposed, including
                 well-known Fisher's Linear Discriminant Analysis (LDA).
                 However, the Fisher criterion is incapable of dealing
                 with heteroscedasticity in the data. A technique based
                 on the Chernoff criterion for linear dimensionality
                 reduction has been proposed that is capable of
                 exploiting heteroscedastic information in the data.
                 While the Chernoff criterion has been shown to
                 outperform the Fisher's, a clear understanding of its
                 exact behavior is lacking. In this article, we show
                 precisely what can be expected from the Chernoff
                 criterion. In particular, we show that the Chernoff
                 criterion exploits the Fisher and Fukunaga--Koontz
                 transforms in computing its linear discriminants.
                 Furthermore, we show that a recently proposed
                 decomposition of the data space into four subspaces is
                 incomplete. We provide arguments on how to best enrich
                 the decomposition of the data space in order to account
                 for heteroscedasticity in the data. Finally, we provide
                 experimental results validating our theoretical
                 analysis.",
  acknowledgement = ack-nhfb,
  articleno =    "8",
  fjournal =     "ACM Transactions on Knowledge Discovery from Data
                 (TKDD)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1054",
}
@Article{Agarwal:2013:ISI,
  author =       "Deepak Agarwal and Rich Caruana and Jian Pei and Ke
                 Wang",
  title =        "Introduction to the {Special Issue ACM SIGKDD 2012}",
  journal =      j-TKDD,
  volume =       "7",
  number =       "3",
  pages =        "9:1--9:??",
  month =        sep,
  year =         "2013",
  CODEN =        "????",
  DOI =          "http://dx.doi.org/10.1145/2513092.2513093",
  ISSN =         "1556-4681 (print), 1556-472X (electronic)",
  ISSN-L =       "1556-4681",
  bibdate =      "Thu Mar 13 09:16:07 MDT 2014",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tkdd/;
                 http://www.math.utah.edu/pub/tex/bib/tkdd.bib",
  acknowledgement = ack-nhfb,
  articleno =    "9",
  fjournal =     "ACM Transactions on Knowledge Discovery from Data
                 (TKDD)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1054",
}
@article{Rakthanmanon:2013:ABD,
  author          = {Thanawin Rakthanmanon and Bilson Campana and Abdullah
                     Mueen and Gustavo Batista and Brandon Westover and
                     Qiang Zhu and Jesin Zakaria and Eamonn Keogh},
  title           = {Addressing Big Data Time Series: Mining Trillions of
                     Time Series Subsequences Under Dynamic Time Warping},
  journal         = j-TKDD,
  fjournal        = {ACM Transactions on Knowledge Discovery from Data
                     (TKDD)},
  journal-url     = {http://portal.acm.org/browse_dl.cfm?idx=J1054},
  volume          = {7},
  number          = {3},
  articleno       = {10},
  pages           = {10:1--10:??},
  month           = sep,
  year            = {2013},
  doi             = {http://dx.doi.org/10.1145/2500489},
  issn            = {1556-4681 (print), 1556-472X (electronic)},
  issn-l          = {1556-4681},
  coden           = {????},
  bibdate         = {Thu Mar 13 09:16:07 MDT 2014},
  bibsource       = {http://www.acm.org/pubs/contents/journals/tkdd/;
                     http://www.math.utah.edu/pub/tex/bib/tkdd.bib},
  acknowledgement = ack-nhfb,
  abstract        = {Most time series data mining algorithms use similarity
                     search as a core subroutine, and thus the time taken
                     for similarity search is the bottleneck for virtually
                     all time series data mining algorithms, including
                     classification, clustering, motif discovery, anomaly
                     detection, and so on. The difficulty of scaling a
                     search to large datasets explains to a great extent why
                     most academic work on time series data mining has
                     plateaued at considering a few millions of time series
                     objects, while much of industry and science sits on
                     billions of time series objects waiting to be explored.
                     In this work we show that by using a combination of
                     four novel ideas we can search and mine massive time
                     series for the first time. We demonstrate the following
                     unintuitive fact: in large datasets we can exactly
                     search under Dynamic Time Warping (DTW) much more
                     quickly than the current state-of-the-art Euclidean
                     distance search algorithms. We demonstrate our work on
                     the largest set of time series experiments ever
                     attempted. In particular, the largest dataset we
                     consider is larger than the combined size of all of the
                     time series datasets considered in all data mining
                     papers ever published. We explain how our ideas allow
                     us to solve higher-level time series data mining
                     problems such as motif discovery and clustering at
                     scales that would otherwise be untenable. Moreover, we
                     show how our ideas allow us to efficiently support the
                     uniform scaling distance measure, a measure whose
                     utility seems to be underappreciated, but which we
                     demonstrate here. In addition to mining massive
                     datasets with up to one trillion datapoints, we will
                     show that our ideas also have implications for
                     real-time monitoring of data streams, allowing us to
                     handle much faster arrival rates and/or use cheaper and
                     lower powered devices than are currently possible.},
}
@article{Sun:2013:PIM,
  author          = {Yizhou Sun and Brandon Norick and Jiawei Han and
                     Xifeng Yan and Philip S. Yu and Xiao Yu},
  title           = {{PathSelClus}: Integrating Meta-Path Selection with
                     User-Guided Object Clustering in Heterogeneous
                     Information Networks},
  journal         = j-TKDD,
  fjournal        = {ACM Transactions on Knowledge Discovery from Data
                     (TKDD)},
  journal-url     = {http://portal.acm.org/browse_dl.cfm?idx=J1054},
  volume          = {7},
  number          = {3},
  articleno       = {11},
  pages           = {11:1--11:??},
  month           = sep,
  year            = {2013},
  doi             = {http://dx.doi.org/10.1145/2500492},
  issn            = {1556-4681 (print), 1556-472X (electronic)},
  issn-l          = {1556-4681},
  coden           = {????},
  bibdate         = {Thu Mar 13 09:16:07 MDT 2014},
  bibsource       = {http://www.acm.org/pubs/contents/journals/tkdd/;
                     http://www.math.utah.edu/pub/tex/bib/tkdd.bib},
  acknowledgement = ack-nhfb,
  abstract        = {Real-world, multiple-typed objects are often
                     interconnected, forming heterogeneous information
                     networks. A major challenge for link-based clustering
                     in such networks is their potential to generate many
                     different results, carrying rather diverse semantic
                     meanings. In order to generate desired clustering, we
                     propose to use meta-path, a path that connects object
                     types via a sequence of relations, to control
                     clustering with distinct semantics. Nevertheless, it is
                     easier for a user to provide a few examples (seeds)
                     than a weighted combination of sophisticated meta-paths
                     to specify her clustering preference. Thus, we propose
                     to integrate meta-path selection with user-guided
                     clustering to cluster objects in networks, where a user
                     first provides a small set of object seeds for each
                     cluster as guidance. Then the system learns the weight
                     for each meta-path that is consistent with the
                     clustering result implied by the guidance, and
                     generates clusters under the learned weights of
                     meta-paths. A probabilistic approach is proposed to
                     solve the problem, and an effective and efficient
                     iterative algorithm, PathSelClus, is proposed to learn
                     the model, where the clustering quality and the
                     meta-path weights mutually enhance each other. Our
                     experiments with several clustering tasks in two real
                     networks and one synthetic network demonstrate the
                     power of the algorithm in comparison with the
                     baselines.},
}
@article{Bellare:2013:ASE,
  author          = {Kedar Bellare and Suresh Iyengar and Aditya
                     Parameswaran and Vibhor Rastogi},
  title           = {Active Sampling for Entity Matching with Guarantees},
  journal         = j-TKDD,
  fjournal        = {ACM Transactions on Knowledge Discovery from Data
                     (TKDD)},
  journal-url     = {http://portal.acm.org/browse_dl.cfm?idx=J1054},
  volume          = {7},
  number          = {3},
  articleno       = {12},
  pages           = {12:1--12:??},
  month           = sep,
  year            = {2013},
  doi             = {http://dx.doi.org/10.1145/2500490},
  issn            = {1556-4681 (print), 1556-472X (electronic)},
  issn-l          = {1556-4681},
  coden           = {????},
  bibdate         = {Thu Mar 13 09:16:07 MDT 2014},
  bibsource       = {http://www.acm.org/pubs/contents/journals/tkdd/;
                     http://www.math.utah.edu/pub/tex/bib/tkdd.bib},
  acknowledgement = ack-nhfb,
  abstract        = {In entity matching, a fundamental issue while training
                     a classifier to label pairs of entities as either
                     duplicates or nonduplicates is the one of selecting
                     informative training examples. Although active learning
                     presents an attractive solution to this problem,
                     previous approaches minimize the misclassification rate
                     (0--1 loss) of the classifier, which is an unsuitable
                     metric for entity matching due to class imbalance
                     (i.e., many more nonduplicate pairs than duplicate
                     pairs). To address this, a recent paper [Arasu et al.
                     2010] proposes to maximize recall of the classifier
                     under the constraint that its precision should be
                     greater than a specified threshold. However, the
                     proposed technique requires the labels of all n input
                     pairs in the worst case. Our main result is an active
                     learning algorithm that approximately maximizes recall
                     of the classifier while respecting a precision
                     constraint with provably sublinear label complexity
                     (under certain distributional assumptions). Our
                     algorithm uses as a black box any active learning
                     module that minimizes 0--1 loss. We show that label
                     complexity of our algorithm is at most log n times the
                     label complexity of the black box, and also bound the
                     difference in the recall of classifier learnt by our
                     algorithm and the recall of the optimal classifier
                     satisfying the precision constraint. We provide an
                     empirical evaluation of our algorithm on several
                     real-world matching data sets that demonstrates the
                     effectiveness of our approach.},
}
@article{Chattopadhyay:2013:BMA,
  author          = {Rita Chattopadhyay and Zheng Wang and Wei Fan and Ian
                     Davidson and Sethuraman Panchanathan and Jieping Ye},
  title           = {Batch Mode Active Sampling Based on Marginal
                     Probability Distribution Matching},
  journal         = j-TKDD,
  fjournal        = {ACM Transactions on Knowledge Discovery from Data
                     (TKDD)},
  journal-url     = {http://portal.acm.org/browse_dl.cfm?idx=J1054},
  volume          = {7},
  number          = {3},
  articleno       = {13},
  pages           = {13:1--13:??},
  month           = sep,
  year            = {2013},
  doi             = {http://dx.doi.org/10.1145/2513092.2513094},
  issn            = {1556-4681 (print), 1556-472X (electronic)},
  issn-l          = {1556-4681},
  coden           = {????},
  bibdate         = {Thu Mar 13 09:16:07 MDT 2014},
  bibsource       = {http://www.acm.org/pubs/contents/journals/tkdd/;
                     http://www.math.utah.edu/pub/tex/bib/tkdd.bib},
  acknowledgement = ack-nhfb,
  abstract        = {Active Learning is a machine learning and data mining
                     technique that selects the most informative samples for
                     labeling and uses them as training data; it is
                     especially useful when there are large amount of
                     unlabeled data and labeling them is expensive.
                     Recently, batch-mode active learning, where a set of
                     samples are selected concurrently for labeling, based
                     on their collective merit, has attracted a lot of
                     attention. The objective of batch-mode active learning
                     is to select a set of informative samples so that a
                     classifier learned on these samples has good
                     generalization performance on the unlabeled data. Most
                     of the existing batch-mode active learning
                     methodologies try to achieve this by selecting samples
                     based on certain criteria. In this article we propose a
                     novel criterion which achieves good generalization
                     performance of a classifier by specifically selecting a
                     set of query samples that minimize the difference in
                     distribution between the labeled and the unlabeled
                     data, after annotation. We explicitly measure this
                     difference based on all candidate subsets of the
                     unlabeled data and select the best subset. The proposed
                     objective is an NP-hard integer programming
                     optimization problem. We provide two optimization
                     techniques to solve this problem. In the first one, the
                     problem is transformed into a convex quadratic
                     programming problem and in the second method the
                     problem is transformed into a linear programming
                     problem. Our empirical studies using publicly available
                     UCI datasets and two biomedical image databases
                     demonstrate the effectiveness of the proposed approach
                     in comparison with the state-of-the-art batch-mode
                     active learning methods. We also present two extensions
                     of the proposed approach, which incorporate uncertainty
                     of the predicted labels of the unlabeled data and
                     transfer learning in the proposed formulation. In
                     addition, we present a joint optimization framework for
                     performing both transfer and active learning
                     simultaneously unlike the existing approaches of
                     learning in two separate stages, that is, typically,
                     transfer learning followed by active learning. We
                     specifically minimize a common objective of reducing
                     distribution difference between the domain adapted
                     source, the queried and labeled samples and the rest of
                     the unlabeled target domain data. Our empirical studies
                     on two biomedical image databases and on a publicly
                     available 20 Newsgroups dataset show that incorporation
                     of uncertainty information and transfer learning
                     further improves the performance of the proposed active
                     learning based classifier. Our empirical studies also
                     show that the proposed transfer-active method based on
                     the joint optimization framework performs significantly
                     better than a framework which implements transfer and
                     active learning in two separate stages.},
}
@article{Briggs:2013:IAM,
  author          = {Forrest Briggs and Xiaoli Z. Fern and Raviv Raich and
                     Qi Lou},
  title           = {Instance Annotation for Multi-Instance Multi-Label
                     Learning},
  journal         = j-TKDD,
  fjournal        = {ACM Transactions on Knowledge Discovery from Data
                     (TKDD)},
  journal-url     = {http://portal.acm.org/browse_dl.cfm?idx=J1054},
  volume          = {7},
  number          = {3},
  articleno       = {14},
  pages           = {14:1--14:??},
  month           = sep,
  year            = {2013},
  doi             = {http://dx.doi.org/10.1145/2500491},
  issn            = {1556-4681 (print), 1556-472X (electronic)},
  issn-l          = {1556-4681},
  coden           = {????},
  bibdate         = {Thu Mar 13 09:16:07 MDT 2014},
  bibsource       = {http://www.acm.org/pubs/contents/journals/tkdd/;
                     http://www.math.utah.edu/pub/tex/bib/tkdd.bib},
  acknowledgement = ack-nhfb,
  abstract        = {Multi-instance multi-label learning (MIML) is a
                     framework for supervised classification where the
                     objects to be classified are bags of instances
                     associated with multiple labels. For example, an image
                     can be represented as a bag of segments and associated
                     with a list of objects it contains. Prior work on MIML
                     has focused on predicting label sets for previously
                     unseen bags. We instead consider the problem of
                     predicting instance labels while learning from data
                     labeled only at the bag level. We propose a regularized
                     rank-loss objective designed for instance annotation,
                     which can be instantiated with different aggregation
                     models connecting instance-level labels with bag-level
                     label sets. The aggregation models that we consider can
                     be factored as a linear function of a ``support
                     instance'' for each class, which is a single feature
                     vector representing a whole bag. Hence we name our
                     proposed methods rank-loss Support Instance Machines
                     (SIM). We propose two optimization methods for the
                     rank-loss objective, which is nonconvex. One is a
                     heuristic method that alternates between updating
                     support instances, and solving a convex problem in
                     which the support instances are treated as constant.
                     The other is to apply the constrained concave-convex
                     procedure (CCCP), which can also be interpreted as
                     iteratively updating support instances and solving a
                     convex problem. To solve the convex problem, we employ
                     the Pegasos framework of primal subgradient descent,
                     and prove that it finds an $ \epsilon $-suboptimal
                     solution in runtime that is linear in the number of
                     bags, instances, and $ 1 / \epsilon $. Additionally, we
                     suggest a method of extending the linear learning
                     algorithm to nonlinear classification, without
                     increasing the runtime asymptotically. Experiments on
                     artificial and real-world datasets including images and
                     audio show that the proposed methods achieve higher
                     accuracy than other loss functions used in prior work,
                     e.g., Hamming loss, and recent work in ambiguous label
                     classification.},
}
@article{Ji:2013:PFR,
  author          = {Ming Ji and Binbin Lin and Xiaofei He and Deng Cai and
                     Jiawei Han},
  title           = {Parallel Field Ranking},
  journal         = j-TKDD,
  fjournal        = {ACM Transactions on Knowledge Discovery from Data
                     (TKDD)},
  journal-url     = {http://portal.acm.org/browse_dl.cfm?idx=J1054},
  volume          = {7},
  number          = {3},
  articleno       = {15},
  pages           = {15:1--15:??},
  month           = sep,
  year            = {2013},
  doi             = {http://dx.doi.org/10.1145/2513092.2513096},
  issn            = {1556-4681 (print), 1556-472X (electronic)},
  issn-l          = {1556-4681},
  coden           = {????},
  bibdate         = {Thu Mar 13 09:16:07 MDT 2014},
  bibsource       = {http://www.acm.org/pubs/contents/journals/tkdd/;
                     http://www.math.utah.edu/pub/tex/bib/tkdd.bib},
  acknowledgement = ack-nhfb,
  abstract        = {Recently, ranking data with respect to the intrinsic
                     geometric structure (manifold ranking) has received
                     considerable attentions, with encouraging performance
                     in many applications in pattern recognition,
                     information retrieval and recommendation systems. Most
                     of the existing manifold ranking methods focus on
                     learning a ranking function that varies smoothly along
                     the data manifold. However, beyond smoothness, a
                     desirable ranking function should vary monotonically
                     along the geodesics of the data manifold, such that the
                     ranking order along the geodesics is preserved. In this
                     article, we aim to learn a ranking function that varies
                     linearly and therefore monotonically along the
                     geodesics of the data manifold. Recent theoretical work
                     shows that the gradient field of a linear function on
                     the manifold has to be a parallel vector field.
                     Therefore, we propose a novel ranking algorithm on the
                     data manifolds, called Parallel Field Ranking.
                     Specifically, we try to learn a ranking function and a
                     vector field simultaneously. We require the vector
                     field to be close to the gradient field of the ranking
                     function, and the vector field to be as parallel as
                     possible. Moreover, we require the value of the ranking
                     function at the query point to be the highest, and then
                     decrease linearly along the manifold. Experimental
                     results on both synthetic data and real data
                     demonstrate the effectiveness of our proposed
                     algorithm.},
}
%%% NOTE(review): the extracted title ended in a stray ``1'', presumably a
%%% footnote mark from the publisher's page; it is not part of the title.
@Article{Adali:2013:IPR,
author = "Sibel Adali and Malik Magdon-Ismail and Xiaohui Lu",
title = "{iHypR}: Prominence ranking in networks of
collaborations with hyperedges",
journal = j-TKDD,
volume = "7",
number = "4",
pages = "16:1--16:??",
month = nov,
year = "2013",
CODEN = "????",
DOI = "http://dx.doi.org/10.1145/2541268.2541269",
ISSN = "1556-4681 (print), 1556-472X (electronic)",
ISSN-L = "1556-4681",
bibdate = "Thu Mar 13 09:16:09 MDT 2014",
bibsource = "http://www.acm.org/pubs/contents/journals/tkdd/;
http://www.math.utah.edu/pub/tex/bib/tkdd.bib",
abstract = "We present a new algorithm called iHypR for computing
prominence of actors in social networks of
collaborations. Our algorithm builds on the assumption
that prominent actors collaborate on prominent objects,
and prominent objects are naturally grouped into
prominent clusters or groups (hyperedges in a graph).
iHypR makes use of the relationships between actors,
objects, and hyperedges to compute a global prominence
score for the actors in the network. We do not assume
the hyperedges are given in advance. Hyperedges
computed by our method can perform as well or even
better than ``true'' hyperedges. Our algorithm is
customized for networks of collaborations, but it is
generally applicable without further tuning. We show,
through extensive experimentation with three real-life
data sets and multiple external measures of prominence,
that our algorithm outperforms existing well-known
algorithms. Our work is the first to offer such an
extensive evaluation. We show that unlike most existing
algorithms, the performance is robust across multiple
measures of performance. Further, we give a detailed
study of the sensitivity of our algorithm to different
data sets and the design choices within the algorithm
that a user may wish to change. Our article illustrates
the various trade-offs that must be considered in
computing prominence in collaborative social
networks.",
acknowledgement = ack-nhfb,
articleno = "16",
fjournal = "ACM Transactions on Knowledge Discovery from Data
(TKDD)",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J1054",
}
%%% TKDD 7(4), November 2013, article 17.
@Article{Huang:2013:STP,
author = "Jin Huang and Feiping Nie and Heng Huang and Yi-Cheng
Tu and Yu Lei",
title = "Social trust prediction using heterogeneous networks",
journal = j-TKDD,
volume = "7",
number = "4",
pages = "17:1--17:??",
month = nov,
year = "2013",
CODEN = "????",
DOI = "http://dx.doi.org/10.1145/2541268.2541270",
ISSN = "1556-4681 (print), 1556-472X (electronic)",
ISSN-L = "1556-4681",
bibdate = "Thu Mar 13 09:16:09 MDT 2014",
bibsource = "http://www.acm.org/pubs/contents/journals/tkdd/;
http://www.math.utah.edu/pub/tex/bib/tkdd.bib",
abstract = "Along with increasing popularity of social websites,
online users rely more on the trustworthiness
information to make decisions, extract and filter
information, and tag and build connections with other
users. However, such social network data often suffer
from severe data sparsity and are not able to provide
users with enough information. Therefore, trust
prediction has emerged as an important topic in social
network research. Traditional approaches are primarily
based on exploring trust graph topology itself.
However, research in sociology and our life experience
suggest that people who are in the same social circle
often exhibit similar behaviors and tastes. To take
advantage of the ancillary information for trust
prediction, the challenge then becomes what to transfer
and how to transfer. In this article, we address this
problem by aggregating heterogeneous social networks
and propose a novel joint social networks mining (JSNM)
method. Our new joint learning model explores the
user-group-level similarity between correlated graphs
and simultaneously learns the individual graph
structure; therefore, the shared structures and
patterns from multiple social networks can be utilized
to enhance the prediction tasks. As a result, we not
only improve the trust prediction in the target graph
but also facilitate other information retrieval tasks
in the auxiliary graphs. To optimize the proposed
objective function, we use the alternative technique to
break down the objective function into several
manageable subproblems. We further introduce the
auxiliary function to solve the optimization problems
with rigorously proved convergence. The extensive
experiments have been conducted on both synthetic and
real-world data. All empirical results demonstrate the
effectiveness of our method.",
acknowledgement = ack-nhfb,
articleno = "17",
fjournal = "ACM Transactions on Knowledge Discovery from Data
(TKDD)",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J1054",
}
%%% TKDD 7(4), November 2013, article 18.  In the abstract, em-dashes are
%%% written as `---' and single-letter variables are set in math mode.
@Article{Guzzo:2013:SIF,
author = "Antonella Guzzo and Luigi Moccia and Domenico
Sacc{\`a} and Edoardo Serra",
title = "Solving inverse frequent itemset mining with
infrequency constraints via large-scale linear
programs",
journal = j-TKDD,
volume = "7",
number = "4",
pages = "18:1--18:??",
month = nov,
year = "2013",
CODEN = "????",
DOI = "http://dx.doi.org/10.1145/2541268.2541271",
ISSN = "1556-4681 (print), 1556-472X (electronic)",
ISSN-L = "1556-4681",
bibdate = "Thu Mar 13 09:16:09 MDT 2014",
bibsource = "http://www.acm.org/pubs/contents/journals/tkdd/;
http://www.math.utah.edu/pub/tex/bib/tkdd.bib",
abstract = "Inverse frequent set mining (IFM) is the problem of
computing a transaction database $D$ satisfying given
support constraints for some itemsets, which are
typically the frequent ones. This article proposes a
new formulation of IFM, called IFM$_I$ (IFM with
infrequency constraints), where the itemsets that are
not listed as frequent are constrained to be
infrequent; that is, they must have a support less than
or equal to a specified unique threshold. An instance
of IFM$_I$ can be seen as an instance of the original
IFM by making explicit the infrequency constraints for
the minimal infrequent itemsets, corresponding to the
so-called negative generator border defined in the
literature. The complexity increase from PSPACE
(complexity of IFM) to NEXP (complexity of IFM$_I$) is
caused by the cardinality of the negative generator
border, which can be exponential in the original input
size. Therefore, the article introduces a specific
problem parameter $ \kappa $ that computes an upper
bound to this cardinality using a hypergraph
interpretation for which minimal infrequent itemsets
correspond to minimal transversals. By fixing a
constant $k$, the article formulates a $k$-bounded
definition of the problem, called $k$-IFM$_I$, that
collects all instances for which the value of the
parameter $ \kappa $ is less than or equal to $k$ ---
its complexity is in PSPACE as for IFM. The bounded
problem is encoded as an integer linear program with a
large number of variables (actually exponential w.r.t.
the number of constraints), which is thereafter
approximated by relaxing integer constraints --- the
decision problem of solving the linear program is
proven to be in NP. In order to solve the linear
program, a column generation technique is used that is
a variation of the simplex method designed to solve
large-scale linear programs, in particular with a huge
number of variables. The method at each step requires
the solution of an auxiliary integer linear program,
which is proven to be NP hard in this case and for
which a greedy heuristic is presented. The resulting
overall column generation solution algorithm enjoys
very good scaling as evidenced by the intensive
experimentation, thereby paving the way for its
application in real-life scenarios.",
acknowledgement = ack-nhfb,
articleno = "18",
fjournal = "ACM Transactions on Knowledge Discovery from Data
(TKDD)",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J1054",
}
@article{Balcazar:2013:FCP,
  author          = {Jos{\'e} L. Balc{\'a}zar},
  title           = {Formal and computational properties of the confidence
                     boost of association rules},
  journal         = j-TKDD,
  fjournal        = {ACM Transactions on Knowledge Discovery from Data
                     (TKDD)},
  journal-url     = {http://portal.acm.org/browse_dl.cfm?idx=J1054},
  volume          = {7},
  number          = {4},
  articleno       = {19},
  pages           = {19:1--19:??},
  month           = nov,
  year            = {2013},
  doi             = {http://dx.doi.org/10.1145/2541268.2541272},
  issn            = {1556-4681 (print), 1556-472X (electronic)},
  issn-l          = {1556-4681},
  coden           = {????},
  bibdate         = {Thu Mar 13 09:16:09 MDT 2014},
  bibsource       = {http://www.acm.org/pubs/contents/journals/tkdd/;
                     http://www.math.utah.edu/pub/tex/bib/tkdd.bib},
  acknowledgement = ack-nhfb,
  abstract        = {Some existing notions of redundancy among association
                     rules allow for a logical-style characterization and
                     lead to irredundant bases of absolutely minimum size.
                     We push the intuition of redundancy further to find an
                     intuitive notion of novelty of an association rule,
                     with respect to other rules. Namely, an irredundant
                     rule is so because its confidence is higher than what
                     the rest of the rules would suggest; then, one can ask:
                     how much higher? We propose to measure such a sort of
                     novelty through the confidence boost of a rule. Acting
                     as a complement to confidence and support, the
                     confidence boost helps to obtain small and crisp sets
                     of mined association rules and solves the well-known
                     problem that, in certain cases, rules of negative
                     correlation may pass the confidence bound. We analyze
                     the properties of two versions of the notion of
                     confidence boost, one of them a natural generalization
                     of the other. We develop algorithms to filter rules
                     according to their confidence boost, compare the
                     concept to some similar notions in the literature, and
                     describe the results of some experimentation employing
                     the new notions on standard benchmark datasets. We
                     describe an open source association mining tool that
                     embodies one of our variants of confidence boost in
                     such a way that the data mining process does not
                     require the user to select any value for any
                     parameter.},
}
@article{Ang:2013:CPN,
  author          = {Hock Hee Ang and Vivekanand Gopalkrishnan and Steven
                     C. H. Hoi and Wee Keong Ng},
  title           = {Classification in {P2P} networks with cascade support
                     vector machines},
  journal         = j-TKDD,
  fjournal        = {ACM Transactions on Knowledge Discovery from Data
                     (TKDD)},
  journal-url     = {http://portal.acm.org/browse_dl.cfm?idx=J1054},
  volume          = {7},
  number          = {4},
  articleno       = {20},
  pages           = {20:1--20:??},
  month           = nov,
  year            = {2013},
  doi             = {http://dx.doi.org/10.1145/2541268.2541273},
  issn            = {1556-4681 (print), 1556-472X (electronic)},
  issn-l          = {1556-4681},
  coden           = {????},
  bibdate         = {Thu Mar 13 09:16:09 MDT 2014},
  bibsource       = {http://www.acm.org/pubs/contents/journals/tkdd/;
                     http://www.math.utah.edu/pub/tex/bib/tkdd.bib},
  acknowledgement = ack-nhfb,
  abstract        = {Classification in Peer-to-Peer (P2P) networks is
                     important to many real applications, such as
                     distributed intrusion detection, distributed
                     recommendation systems, and distributed antispam
                     detection. However, it is very challenging to perform
                     classification in P2P networks due to many practical
                     issues, such as scalability, peer dynamism, and
                     asynchronism. This article investigates the practical
                     techniques of constructing Support Vector Machine (SVM)
                     classifiers in the P2P networks. In particular, we
                     demonstrate how to efficiently cascade SVM in a P2P
                     network with the use of reduced SVM. In addition, we
                     propose to fuse the concept of cascade SVM with
                     bootstrap aggregation to effectively balance the
                     trade-off between classification accuracy, model
                     construction, and prediction cost. We provide
                     theoretical insights for the proposed solutions and
                     conduct an extensive set of empirical studies on a
                     number of large-scale datasets. Encouraging results
                     validate the efficacy of the proposed approach.},
}
@article{Chen:2014:ISI,
  author          = {Wei Chen and Jie Tang},
  title           = {Introduction to special issue on computational aspects
                     of social and information networks: Theory,
                     methodologies, and applications {(TKDD-CASIN)}},
  journal         = j-TKDD,
  fjournal        = {ACM Transactions on Knowledge Discovery from Data
                     (TKDD)},
  journal-url     = {http://portal.acm.org/browse_dl.cfm?idx=J1054},
  volume          = {8},
  number          = {1},
  articleno       = {1},
  pages           = {1:1--1:??},
  month           = feb,
  year            = {2014},
  doi             = {http://dx.doi.org/10.1145/2556608},
  issn            = {1556-4681 (print), 1556-472X (electronic)},
  issn-l          = {1556-4681},
  coden           = {????},
  bibdate         = {Thu Mar 13 09:16:11 MDT 2014},
  bibsource       = {http://www.acm.org/pubs/contents/journals/tkdd/;
                     http://www.math.utah.edu/pub/tex/bib/tkdd.bib},
  acknowledgement = ack-nhfb,
}
@article{Yang:2014:USN,
  author          = {Zhi Yang and Christo Wilson and Xiao Wang and Tingting
                     Gao and Ben Y. Zhao and Yafei Dai},
  title           = {Uncovering social network {Sybils} in the wild},
  journal         = j-TKDD,
  fjournal        = {ACM Transactions on Knowledge Discovery from Data
                     (TKDD)},
  journal-url     = {http://portal.acm.org/browse_dl.cfm?idx=J1054},
  volume          = {8},
  number          = {1},
  articleno       = {2},
  pages           = {2:1--2:??},
  month           = feb,
  year            = {2014},
  doi             = {http://dx.doi.org/10.1145/2556609},
  issn            = {1556-4681 (print), 1556-472X (electronic)},
  issn-l          = {1556-4681},
  coden           = {????},
  bibdate         = {Thu Mar 13 09:16:11 MDT 2014},
  bibsource       = {http://www.acm.org/pubs/contents/journals/tkdd/;
                     http://www.math.utah.edu/pub/tex/bib/tkdd.bib},
  acknowledgement = ack-nhfb,
  abstract        = {Sybil accounts are fake identities created to unfairly
                     increase the power or resources of a single malicious
                     user. Researchers have long known about the existence
                     of Sybil accounts in online communities such as
                     file-sharing systems, but they have not been able to
                     perform large-scale measurements to detect them or
                     measure their activities. In this article, we describe
                     our efforts to detect, characterize, and understand
                     Sybil account activity in the Renren Online Social
                     Network (OSN). We use ground truth provided by Renren
                     Inc. to build measurement-based Sybil detectors and
                     deploy them on Renren to detect more than 100,000 Sybil
                     accounts. Using our full dataset of 650,000 Sybils, we
                     examine several aspects of Sybil behavior. First, we
                     study their link creation behavior and find that
                     contrary to prior conjecture, Sybils in OSNs do not
                     form tight-knit communities. Next, we examine the
                     fine-grained behaviors of Sybils on Renren using
                     clickstream data. Third, we investigate
                     behind-the-scenes collusion between large groups of
                     Sybils. Our results reveal that Sybils with no explicit
                     social ties still act in concert to launch attacks.
                     Finally, we investigate enhanced techniques to identify
                     stealthy Sybils. In summary, our study advances the
                     understanding of Sybil behavior on OSNs and shows that
                     Sybils can effectively avoid existing community-based
                     Sybil detectors. We hope that our results will foster
                     new research on Sybil detection that is based on novel
                     types of Sybil features.},
}
@article{Jin:2014:SAR,
  author          = {Ruoming Jin and Victor E. Lee and Longjie Li},
  title           = {Scalable and axiomatic ranking of network role
                     similarity},
  journal         = j-TKDD,
  fjournal        = {ACM Transactions on Knowledge Discovery from Data
                     (TKDD)},
  journal-url     = {http://portal.acm.org/browse_dl.cfm?idx=J1054},
  volume          = {8},
  number          = {1},
  articleno       = {3},
  pages           = {3:1--3:??},
  month           = feb,
  year            = {2014},
  doi             = {http://dx.doi.org/10.1145/2518176},
  issn            = {1556-4681 (print), 1556-472X (electronic)},
  issn-l          = {1556-4681},
  coden           = {????},
  bibdate         = {Thu Mar 13 09:16:11 MDT 2014},
  bibsource       = {http://www.acm.org/pubs/contents/journals/tkdd/;
                     http://www.math.utah.edu/pub/tex/bib/tkdd.bib},
  acknowledgement = ack-nhfb,
  abstract        = {A key task in analyzing social networks and other
                     complex networks is role analysis: describing and
                     categorizing nodes according to how they interact with
                     other nodes. Two nodes have the same role if they
                     interact with equivalent sets of neighbors. The most
                     fundamental role equivalence is automorphic
                     equivalence. Unfortunately, the fastest algorithms
                     known for graph automorphism are nonpolynomial.
                     Moreover, since exact equivalence is rare, a more
                     meaningful task is measuring the role similarity
                     between any two nodes. This task is closely related to
                     the structural or link-based similarity problem that
                     SimRank addresses. However, SimRank and other existing
                     similarity measures are not sufficient because they do
                     not guarantee to recognize automorphically or
                     structurally equivalent nodes. This article makes two
                     contributions. First, we present and justify several
                     axiomatic properties necessary for a role similarity
                     measure or metric. Second, we present RoleSim, a new
                     similarity metric that satisfies these axioms and can
                     be computed with a simple iterative algorithm. We
                     rigorously prove that RoleSim satisfies all of these
                     axiomatic properties. We also introduce Iceberg
                     RoleSim, a scalable algorithm that discovers all pairs
                     with RoleSim scores above a user-defined threshold $
                     \theta $. We demonstrate the interpretative power of
                     RoleSim on both synthetic and real datasets.},
}
@Article{Mcauley:2014:DSC,
author = "Julian McAuley and Jure Leskovec",
title = "Discovering social circles in ego networks",
journal = j-TKDD,
volume = "8",
number = "1",
pages = "4:1--4:??",
month = feb,
year = "2014",
CODEN = "????",
DOI = "http://dx.doi.org/10.1145/2556612",
ISSN = "1556-4681 (print), 1556-472X (electronic)",
ISSN-L = "1556-4681",
bibdate = "Thu Mar 13 09:16:11 MDT 2014",
bibsource = "http://www.acm.org/pubs/contents/journals/tkdd/;
http://www.math.utah.edu/pub/tex/bib/tkdd.bib",
abstract = "People's personal social networks are big and
cluttered, and currently there is no good way to
automatically organize them. Social networking sites
allow users to manually categorize their friends into
social circles (e.g., ``circles'' on Google+, and
``lists'' on Facebook and Twitter). However, circles
are laborious to construct and must be manually updated
whenever a user's network grows. In this article, we
study the novel task of automatically identifying
users' social circles. We pose this task as a
multimembership node clustering problem on a user's ego
network, a network of connections between her friends.
We develop a model for detecting circles that combines
network structure as well as user profile information.
For each circle, we learn its members and the
circle-specific user profile similarity metric.
Modeling node membership to multiple circles allows us
to detect overlapping as well as hierarchically nested
circles. Experiments show that our model accurately
identifies circles on a diverse set of data from
Facebook, Google+, and Twitter, for all of which we
obtain hand-labeled ground truth.",
acknowledgement = ack-nhfb,
articleno = "4",
fjournal = "ACM Transactions on Knowledge Discovery from Data
(TKDD)",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J1054",
}
@Article{Abrahao:2014:SFA,
author = "Bruno Abrahao and Sucheta Soundarajan and John
Hopcroft and Robert Kleinberg",
title = "A separability framework for analyzing community
structure",
journal = j-TKDD,
volume = "8",
number = "1",
pages = "5:1--5:??",
month = feb,
year = "2014",
CODEN = "????",
DOI = "http://dx.doi.org/10.1145/2527231",
ISSN = "1556-4681 (print), 1556-472X (electronic)",
ISSN-L = "1556-4681",
bibdate = "Thu Mar 13 09:16:11 MDT 2014",
bibsource = "http://www.acm.org/pubs/contents/journals/tkdd/;
http://www.math.utah.edu/pub/tex/bib/tkdd.bib",
abstract = "Four major factors govern the intricacies of community
extraction in networks: (1) the literature offers a
multitude of disparate community detection algorithms
whose output exhibits high structural variability
across the collection, (2) communities identified by
algorithms may differ structurally from real
communities that arise in practice, (3) there is no
consensus characterizing how to discriminate
communities from noncommunities, and (4) the
application domain includes a wide variety of networks
of fundamentally different natures. In this article, we
present a class separability framework to tackle these
challenges through a comprehensive analysis of
community properties. Our approach enables the
assessment of the structural dissimilarity among the
output of multiple community detection algorithms and
between the output of algorithms and communities that
arise in practice. In addition, our method provides us
with a way to organize the vast collection of community
detection algorithms by grouping those that behave
similarly. Finally, we identify the most discriminative
graph-theoretical properties of community signature and
the small subset of properties that account for most of
the biases of the different community detection
algorithms. We illustrate our approach with an
experimental analysis, which reveals nuances of the
structure of real and extracted communities. In our
experiments, we furnish our framework with the output
of 10 different community detection procedures,
representative of categories of popular algorithms
available in the literature, applied to a diverse
collection of large-scale real network datasets whose
domains span biology, online shopping, and social
systems. We also analyze communities identified by
annotations that accompany the data, which reflect
exemplar communities in various domains. We characterize
these communities using a broad spectrum of community
properties to produce the different structural classes.
As our experiments show that community structure is not
a universal concept, our framework enables an informed
choice of the most suitable community detection method
for identifying communities of a specific type in a
given network and allows for a comparison of existing
community detection algorithms while guiding the design
of new ones.",
acknowledgement = ack-nhfb,
articleno = "5",
fjournal = "ACM Transactions on Knowledge Discovery from Data
(TKDD)",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J1054",
}
@Article{Zhong:2014:UBL,
author = "Erheng Zhong and Wei Fan and Qiang Yang",
title = "User behavior learning and transfer in composite
social networks",
journal = j-TKDD,
volume = "8",
number = "1",
pages = "6:1--6:??",
month = feb,
year = "2014",
CODEN = "????",
DOI = "http://dx.doi.org/10.1145/2556613",
ISSN = "1556-4681 (print), 1556-472X (electronic)",
ISSN-L = "1556-4681",
bibdate = "Thu Mar 13 09:16:11 MDT 2014",
bibsource = "http://www.acm.org/pubs/contents/journals/tkdd/;
http://www.math.utah.edu/pub/tex/bib/tkdd.bib",
abstract = "Accurate prediction of user behaviors is important for
many social media applications, including social
marketing, personalization, and recommendation. A major
challenge lies in that although many previous works
model user behavior from only historical behavior logs,
the available user behavior data or interactions
between users and items in a given social network are
usually very limited and sparse (e.g., $ \geq 99.9 \% $
empty), which makes models overfit the rare
observations and fail to provide accurate predictions.
We observe that many people are members of several
social networks at the same time, such as Facebook,
Twitter, and Tencent's QQ. Importantly, users'
behaviors and interests in different networks influence
one another. This provides an opportunity to leverage
the knowledge of user behaviors in different networks
by considering the overlapping users in different
networks as bridges, in order to alleviate the data
sparsity problem, and enhance the predictive
performance of user behavior modeling. Combining
different networks ``simply and naively'' does not work
well. In this article, we formulate the problem to
model multiple networks as ``adaptive composite
transfer'' and propose a framework called ComSoc.
ComSoc first selects the most suitable networks inside
a composite social network via a hierarchical Bayesian
model, parameterized for individual users. It then
builds topic models for user behavior prediction using
both the relationships in the selected networks and
related behavior data. With different relational
regularization, we introduce different implementations,
corresponding to different ways to transfer knowledge
from composite social relations. To handle big data, we
have implemented the algorithm using Map/Reduce. We
demonstrate that the proposed composite network-based
user behavior models significantly improve the
predictive accuracy over a number of existing
approaches on several real-world applications,
including a very large social networking dataset from
Tencent Inc.",
acknowledgement = ack-nhfb,
articleno = "6",
fjournal = "ACM Transactions on Knowledge Discovery from Data
(TKDD)",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J1054",
}
@Article{Ahmed:2014:NSS,
  author = {Nesreen K. Ahmed and Jennifer Neville and Ramana
    Kompella},
  title = {Network Sampling: From Static to Streaming Graphs},
  journal = j-TKDD,
  volume = {8},
  number = {2},
  pages = {7:1--7:??},
  month = jun,
  year = {2014},
  CODEN = {????},
  DOI = {http://dx.doi.org/10.1145/2601438},
  ISSN = {1556-4681 (print), 1556-472X (electronic)},
  ISSN-L = {1556-4681},
  bibdate = {Thu Jun 26 05:48:22 MDT 2014},
  bibsource = {http://www.acm.org/pubs/contents/journals/tkdd/;
    http://www.math.utah.edu/pub/tex/bib/tkdd.bib},
  abstract = {Network sampling is integral to the analysis of
    social, information, and biological networks. Since
    many real-world networks are massive in size,
    continuously evolving, and/or distributed in nature,
    the network structure is often sampled in order to
    facilitate study. For these reasons, a more thorough
    and complete understanding of network sampling is
    critical to support the field of network science. In
    this paper, we outline a framework for the general
    problem of network sampling by highlighting the
    different objectives, population and units of interest,
    and classes of network sampling methods. In addition,
    we propose a spectrum of computational models for
    network sampling methods, ranging from the
    traditionally studied model based on the assumption of
    a static domain to a more challenging model that is
    appropriate for streaming domains. We design a family
    of sampling methods based on the concept of graph
    induction that generalize across the full spectrum of
    computational models (from static to streaming) while
    efficiently preserving many of the topological
    properties of the input graphs. Furthermore, we
    demonstrate how traditional static sampling algorithms
    can be modified for graph streams for each of the three
    main classes of sampling methods: node, edge, and
    topology-based sampling. Experimental results indicate
    that our proposed family of sampling methods more
    accurately preserve the underlying properties of the
    graph in both static and streaming domains. Finally, we
    study the impact of network sampling algorithms on the
    parameter estimation and performance evaluation of
    relational classification algorithms.},
  acknowledgement = ack-nhfb,
  articleno = {7},
  fjournal = {ACM Transactions on Knowledge Discovery from Data
    (TKDD)},
  journal-URL = {http://portal.acm.org/browse_dl.cfm?idx=J1054},
}
@Article{Ge:2014:RMA,
author = "Yong Ge and Guofei Jiang and Min Ding and Hui Xiong",
title = "Ranking Metric Anomaly in Invariant Networks",
journal = j-TKDD,
volume = "8",
number = "2",
pages = "8:1--8:??",
month = jun,
year = "2014",
CODEN = "????",
DOI = "http://dx.doi.org/10.1145/2601436",
ISSN = "1556-4681 (print), 1556-472X (electronic)",
ISSN-L = "1556-4681",
bibdate = "Thu Jun 26 05:48:22 MDT 2014",
bibsource = "http://www.acm.org/pubs/contents/journals/tkdd/;
http://www.math.utah.edu/pub/tex/bib/tkdd.bib",
abstract = "The management of large-scale distributed information
systems relies on the effective use and modeling of
monitoring data collected at various points in the
distributed information systems. A traditional approach
to model monitoring data is to discover invariant
relationships among the monitoring data. Indeed, we can
discover all invariant relationships among all pairs of
monitoring data and generate invariant networks, where
a node is a monitoring data source (metric) and a link
indicates an invariant relationship between two
monitoring data. Such an invariant network
representation can help system experts to localize and
diagnose the system faults by examining those broken
invariant relationships and their related metrics,
since system faults usually propagate among the
monitoring data and eventually lead to some broken
invariant relationships. However, at one time, there
are usually a lot of broken links (invariant
relationships) within an invariant network. Without
proper guidance, it is difficult for system experts to
manually inspect this large number of broken links. To
this end, in this article, we propose the problem of
ranking metrics according to the anomaly levels for a
given invariant network, while this is a nontrivial
task due to the uncertainties and the complex nature of
invariant networks. Specifically, we propose two types
of algorithms for ranking metric anomaly by link
analysis in invariant networks. Along this line, we
first define two measurements to quantify the anomaly
level of each metric, and introduce the mRank
algorithm. Also, we provide a weighted score mechanism
and develop the gRank algorithm, which involves an
iterative process to obtain a score to measure the
anomaly levels. In addition, some extended algorithms
based on mRank and gRank algorithms are developed
by taking into account the probability of being broken
as well as noisy links. Finally, we validate all the
proposed algorithms on a large number of real-world and
synthetic data sets to illustrate the effectiveness and
efficiency of different algorithms.",
acknowledgement = ack-nhfb,
articleno = "8",
fjournal = "ACM Transactions on Knowledge Discovery from Data
(TKDD)",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J1054",
}
@Article{Zhang:2014:DGP,
author = "Gensheng Zhang and Xiao Jiang and Ping Luo and Min
Wang and Chengkai Li",
title = "Discovering General Prominent Streaks in Sequence
Data",
journal = j-TKDD,
volume = "8",
number = "2",
pages = "9:1--9:??",
month = jun,
year = "2014",
CODEN = "????",
DOI = "http://dx.doi.org/10.1145/2601439",
ISSN = "1556-4681 (print), 1556-472X (electronic)",
ISSN-L = "1556-4681",
bibdate = "Thu Jun 26 05:48:22 MDT 2014",
bibsource = "http://www.acm.org/pubs/contents/journals/tkdd/;
http://www.math.utah.edu/pub/tex/bib/tkdd.bib",
abstract = "This article studies the problem of prominent streak
discovery in sequence data. Given a sequence of values,
a prominent streak is a long consecutive subsequence
consisting of only large (small) values, such as
consecutive games of outstanding performance in sports,
consecutive hours of heavy network traffic, and
consecutive days of frequent mentioning of a person in
social media. Prominent streak discovery provides
insightful data patterns for data analysis in many
real-world applications and is an enabling technique
for computational journalism. Given its real-world
usefulness and complexity, the research on prominent
streaks in sequence data opens a spectrum of
challenging problems. A baseline approach to finding
prominent streaks is a quadratic algorithm that
exhaustively enumerates all possible streaks and
performs pairwise streak dominance comparison. For more
efficient methods, we make the observation that
prominent streaks are in fact skyline points in two
dimensions---streak interval length and minimum value in
the interval. Our solution thus hinges on the idea to
separate the two steps in prominent streak discovery:
candidate streak generation and skyline operation over
candidate streaks. For candidate generation, we propose
the concept of local prominent streak (LPS). We prove
that prominent streaks are a subset of LPSs and the
number of LPSs is less than the length of a data
sequence, in comparison with the quadratic number of
candidates produced by the brute-force baseline method.
We develop efficient algorithms based on the concept of
LPS. The nonlinear local prominent streak (NLPS)-based
method considers a superset of LPSs as candidates, and
the linear local prominent streak (LLPS)-based method
further guarantees to consider only LPSs. The proposed
properties and algorithms are also extended for
discovering general top-$k$, multisequence, and
multidimensional prominent streaks. The results of
experiments using multiple real datasets verified the
effectiveness of the proposed methods and showed orders
of magnitude performance improvement against the
baseline method.",
acknowledgement = ack-nhfb,
articleno = "9",
fjournal = "ACM Transactions on Knowledge Discovery from Data
(TKDD)",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J1054",
}
@Article{Schifanella:2014:MTD,
  author = {Claudio Schifanella and K. Sel{\c{c}}uk Candan and
    Maria Luisa Sapino},
  title = {Multiresolution Tensor Decompositions with Mode
    Hierarchies},
  journal = j-TKDD,
  volume = {8},
  number = {2},
  pages = {10:1--10:??},
  month = jun,
  year = {2014},
  CODEN = {????},
  DOI = {http://dx.doi.org/10.1145/2532169},
  ISSN = {1556-4681 (print), 1556-472X (electronic)},
  ISSN-L = {1556-4681},
  bibdate = {Thu Jun 26 05:48:22 MDT 2014},
  bibsource = {http://www.acm.org/pubs/contents/journals/tkdd/;
    http://www.math.utah.edu/pub/tex/bib/tkdd.bib},
  abstract = {Tensors (multidimensional arrays) are widely used for
    representing high-order dimensional data, in
    applications ranging from social networks, sensor data,
    and Internet traffic. Multiway data analysis
    techniques, in particular tensor decompositions, allow
    extraction of hidden correlations among multiway data
    and thus are key components of many data analysis
    frameworks. Intuitively, these algorithms can be
    thought of as multiway clustering schemes, which
    consider multiple facets of the data in identifying
    clusters, their weights, and contributions of each data
    element. Unfortunately, algorithms for fitting multiway
    models are, in general, iterative and very time
    consuming. In this article, we observe that, in many
    applications, there is a priori background knowledge
    (or metadata) about one or more domain dimensions. This
    metadata is often in the form of a hierarchy that
    clusters the elements of a given data facet (or mode).
    We investigate whether such single-mode data
    hierarchies can be used to boost the efficiency of
    tensor decomposition process, without significant
    impact on the final decomposition quality. We consider
    each domain hierarchy as a guide to help provide
    higher- or lower-resolution views of the data in the
    tensor on demand and we rely on these metadata-induced
    multiresolution tensor representations to develop a
    multiresolution approach to tensor decomposition. In
    this article, we focus on an alternating least squares
    (ALS)--based implementation of the two most important
    decomposition models such as the PARAllel FACtors
    (PARAFAC, which decomposes a tensor into a diagonal
    tensor and a set of factor matrices) and the Tucker
    (which produces as result a core tensor and a set of
    dimension-subspaces matrices). Experiment results show
    that, when the available metadata is used as a rough
    guide, the proposed multiresolution method helps fit
    both PARAFAC and Tucker models with consistent (under
    different parameters settings) savings in execution
    time and memory consumption, while preserving the
    quality of the decomposition.},
  acknowledgement = ack-nhfb,
  articleno = {10},
  fjournal = {ACM Transactions on Knowledge Discovery from Data
    (TKDD)},
  journal-URL = {http://portal.acm.org/browse_dl.cfm?idx=J1054},
}
@Article{Huang:2014:RMN,
author = "Jin Huang and Feiping Nie and Heng Huang and Chris
Ding",
title = "Robust Manifold Nonnegative Matrix Factorization",
journal = j-TKDD,
volume = "8",
number = "3",
pages = "11:1--11:??",
month = jun,
year = "2014",
CODEN = "????",
DOI = "http://dx.doi.org/10.1145/2601434",
ISSN = "1556-4681 (print), 1556-472X (electronic)",
ISSN-L = "1556-4681",
bibdate = "Tue Jun 3 13:50:26 MDT 2014",
bibsource = "http://www.acm.org/pubs/contents/journals/tkdd/;
http://www.math.utah.edu/pub/tex/bib/tkdd.bib",
abstract = "Nonnegative Matrix Factorization (NMF) has been one of
the most widely used clustering techniques for
exploratory data analysis. However, since each data
point enters the objective function with squared
residue error, a few outliers with large errors easily
dominate the objective function. In this article, we
propose a Robust Manifold Nonnegative Matrix
Factorization (RMNMF) method using $ \ell_{2, 1} $-norm and
integrating NMF and spectral clustering under the same
clustering framework. We also point out the solution
uniqueness issue for the existing NMF methods and
propose an additional orthonormal constraint to address
this problem. With the new constraint, the conventional
auxiliary function approach no longer works. We tackle
this difficult optimization problem via a novel
Augmented Lagrangian Method (ALM)--based algorithm and
convert the original constrained optimization problem
on one variable into a multivariate constrained
problem. The new objective function then can be
decomposed into several subproblems that each has a
closed-form solution. More importantly, we reveal the
connection of our method with robust $K$-means and
spectral clustering, and we demonstrate its theoretical
significance. Extensive experiments have been conducted
on nine benchmark datasets, and all empirical results
show the effectiveness of our method.",
acknowledgement = ack-nhfb,
articleno = "11",
fjournal = "ACM Transactions on Knowledge Discovery from Data
(TKDD)",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J1054",
}
@Article{Zhang:2014:RAL,
author = "Yu Zhang and Dit-Yan Yeung",
title = "A Regularization Approach to Learning Task
Relationships in Multitask Learning",
journal = j-TKDD,
volume = "8",
number = "3",
pages = "12:1--12:??",
month = jun,
year = "2014",
CODEN = "????",
DOI = "http://dx.doi.org/10.1145/2538028",
ISSN = "1556-4681 (print), 1556-472X (electronic)",
ISSN-L = "1556-4681",
bibdate = "Tue Jun 3 13:50:26 MDT 2014",
bibsource = "http://www.acm.org/pubs/contents/journals/tkdd/;
http://www.math.utah.edu/pub/tex/bib/tkdd.bib",
abstract = "Multitask learning is a learning paradigm that seeks
to improve the generalization performance of a learning
task with the help of some other related tasks. In this
article, we propose a regularization approach to
learning the relationships between tasks in multitask
learning. This approach can be viewed as a novel
generalization of the regularized formulation for
single-task learning. Besides modeling positive task
correlation, our approach---multitask relationship
learning (MTRL)---can also describe negative task
correlation and identify outlier tasks based on the
same underlying principle. By utilizing a
matrix-variate normal distribution as a prior on the
model parameters of all tasks, our MTRL method has a
jointly convex objective function. For efficiency, we
use an alternating method to learn the optimal model
parameters for each task as well as the relationships
between tasks. We study MTRL in the symmetric multitask
learning setting and then generalize it to the
asymmetric setting as well. We also discuss some
variants of the regularization approach to demonstrate
the use of other matrix-variate priors for learning
task relationships. Moreover, to gain more insight into
our model, we also study the relationships between MTRL
and some existing multitask learning methods.
Experiments conducted on a toy problem as well as
several benchmark datasets demonstrate the
effectiveness of MTRL as well as its high
interpretability revealed by the task covariance
matrix.",
acknowledgement = ack-nhfb,
articleno = "12",
fjournal = "ACM Transactions on Knowledge Discovery from Data
(TKDD)",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J1054",
}
@Article{Lin:2014:SCR,
author = "Ming Lin and Shifeng Weng and Changshui Zhang",
title = "On the Sample Complexity of Random {Fourier} Features
for Online Learning: How Many Random {Fourier} Features
Do We Need?",
journal = j-TKDD,
volume = "8",
number = "3",
pages = "13:1--13:??",
month = jun,
year = "2014",
CODEN = "????",
DOI = "http://dx.doi.org/10.1145/2611378",
ISSN = "1556-4681 (print), 1556-472X (electronic)",
ISSN-L = "1556-4681",
bibdate = "Tue Jun 3 13:50:26 MDT 2014",
bibsource = "http://www.acm.org/pubs/contents/journals/tkdd/;
http://www.math.utah.edu/pub/tex/bib/tkdd.bib",
abstract = "We study the sample complexity of random Fourier
features for online kernel learning---that is, the number
of random Fourier features required to achieve good
generalization performance. We show that when the loss
function is strongly convex and smooth, online kernel
learning with random Fourier features can achieve an $
O(\log T / T) $ bound for the excess risk with only $
O (1 / \lambda^2)$ random Fourier features, where $T$ is
the number of training examples and $ \lambda $ is the
modulus of strong convexity. This is a significant
improvement compared to the existing result for batch
kernel learning that requires $ O(T)$ random Fourier
features to achieve a generalization bound $ O(1 /
\sqrt T)$. Our empirical study verifies that online
kernel learning with a limited number of random Fourier
features can achieve similar generalization performance
as online learning using full kernel matrix. We also
present an enhanced online learning algorithm with
random Fourier features that improves the
classification performance by multiple passes of
training examples and a partial average.",
acknowledgement = ack-nhfb,
articleno = "13",
fjournal = "ACM Transactions on Knowledge Discovery from Data
(TKDD)",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J1054",
}
@Article{Eyal:2014:PIM,
  author = {Ron Eyal and Avi Rosenfeld and Sigal Sina and Sarit
    Kraus},
  title = {Predicting and Identifying Missing Node Information in
    Social Networks},
  journal = j-TKDD,
  volume = {8},
  number = {3},
  pages = {14:1--14:??},
  month = jun,
  year = {2014},
  CODEN = {????},
  DOI = {http://dx.doi.org/10.1145/2536775},
  ISSN = {1556-4681 (print), 1556-472X (electronic)},
  ISSN-L = {1556-4681},
  bibdate = {Thu Jun 26 05:48:23 MDT 2014},
  bibsource = {http://www.acm.org/pubs/contents/journals/tkdd/;
    http://www.math.utah.edu/pub/tex/bib/tkdd.bib},
  abstract = {In recent years, social networks have surged in
    popularity. One key aspect of social network research
    is identifying important missing information that is
    not explicitly represented in the network, or is not
    visible to all. To date, this line of research
    typically focused on finding the connections that are
    missing between nodes, a challenge typically termed as
    the link prediction problem. This article introduces
    the missing node identification problem, where missing
    members in the social network structure must be
    identified. In this problem, indications of missing
    nodes are assumed to exist. Given these indications and
    a partial network, we must assess which indications
    originate from the same missing node and determine the
    full network structure. Toward solving this problem, we
    present the missing node identification by spectral
    clustering algorithm (MISC), an approach based on a
    spectral clustering algorithm, combined with nodes'
    pairwise affinity measures that were adopted from link
    prediction research. We evaluate the performance of our
    approach in different problem settings and scenarios,
    using real-life data from Facebook. The results show
    that our approach has beneficial results and can be
    effective in solving the missing node identification
    problem. In addition, this article also presents
    R-MISC, which uses a sparse matrix representation,
    efficient algorithms for calculating the nodes'
    pairwise affinity, and a proprietary dimension
    reduction technique to enable scaling the MISC
    algorithm to large networks of more than 100,000 nodes.
    Last, we consider problem settings where some of the
    indications are unknown. Two algorithms are suggested
    for this problem: speculative MISC, based on MISC, and
    missing link completion, based on classical link
    prediction literature. We show that speculative MISC
    outperforms missing link completion.},
  acknowledgement = ack-nhfb,
  articleno = {14},
  fjournal = {ACM Transactions on Knowledge Discovery from Data
    (TKDD)},
  journal-URL = {http://portal.acm.org/browse_dl.cfm?idx=J1054},
}
@Article{Webb:2014:EDM,
author = "Geoffrey I. Webb and Jilles Vreeken",
title = "Efficient Discovery of the Most Interesting
Associations",
journal = j-TKDD,
volume = "8",
number = "3",
pages = "15:1--15:??",
month = jun,
year = "2014",
CODEN = "????",
DOI = "http://dx.doi.org/10.1145/2601433",
ISSN = "1556-4681 (print), 1556-472X (electronic)",
ISSN-L = "1556-4681",
bibdate = "Thu Jun 26 05:48:23 MDT 2014",
bibsource = "http://www.acm.org/pubs/contents/journals/tkdd/;
http://www.math.utah.edu/pub/tex/bib/tkdd.bib",
abstract = "Self-sufficient itemsets have been proposed as an
effective approach to summarizing the key associations
in data. However, their computation appears highly
demanding, as assessing whether an itemset is
self-sufficient requires consideration of all pairwise
partitions of the itemset into pairs of subsets as well
as consideration of all supersets. This article
presents the first published algorithm for efficiently
discovering self-sufficient itemsets. This
branch-and-bound algorithm deploys two powerful pruning
mechanisms based on upper bounds on itemset value and
statistical significance level. It demonstrates that
finding top-$k$ productive and nonredundant itemsets,
with postprocessing to identify those that are not
independently productive, can efficiently identify
small sets of key associations. We present extensive
evaluation of the strengths and limitations of the
technique, including comparisons with alternative
approaches to finding the most interesting
associations.",
acknowledgement = ack-nhfb,
articleno = "15",
fjournal = "ACM Transactions on Knowledge Discovery from Data
(TKDD)",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J1054",
}
@Article{Shabtai:2014:ODM,
  author = {Asaf Shabtai and Maya Bercovitch and Lior Rokach and
    Yuval Elovici},
  title = {Optimizing Data Misuse Detection},
  journal = j-TKDD,
  volume = {8},
  number = {3},
  pages = {16:1--16:??},
  month = jun,
  year = {2014},
  CODEN = {????},
  DOI = {http://dx.doi.org/10.1145/2611520},
  ISSN = {1556-4681 (print), 1556-472X (electronic)},
  ISSN-L = {1556-4681},
  bibdate = {Tue Jun 3 13:50:26 MDT 2014},
  bibsource = {http://www.acm.org/pubs/contents/journals/tkdd/;
    http://www.math.utah.edu/pub/tex/bib/tkdd.bib},
  abstract = {Data misuse may be performed by entities such as an
    organization's employees and business partners who are
    granted access to sensitive information and misuse
    their privileges. We assume that users can be either
    trusted or untrusted. The access of untrusted parties
    to data objects (e.g., client and patient records)
    should be monitored in an attempt to detect misuse.
    However, monitoring data objects is resource intensive
    and time-consuming and may also cause disturbance or
    inconvenience to the involved employees. Therefore, the
    monitored data objects should be carefully selected. In
    this article, we present two optimization problems
    carefully designed for selecting specific data objects
    for monitoring, such that the detection rate is
    maximized and the monitoring effort is minimized. In
    the first optimization problem, the goal is to select
    data objects for monitoring that are accessed by at
    most c trusted agents while ensuring access to at least
    k monitored objects by each untrusted agent (both c and
    k are integer variable). As opposed to the first
    optimization problem, the goal of the second
    optimization problem is to select monitored data
    objects that maximize the number of monitored data
    objects accessed by untrusted agents while ensuring
    that each trusted agent does not access more than d
    monitored data objects (d is an integer variable as
    well). Two efficient heuristic algorithms for solving
    these optimization problems are proposed, and
    experiments were conducted simulating different
    scenarios to evaluate the algorithms' performance.
    Moreover, we compared the heuristic algorithms'
    performance to the optimal solution and conducted
    sensitivity analysis on the three parameters (c, k, and
    d) and on the ratio between the trusted and untrusted
    agents.},
  acknowledgement = ack-nhfb,
  articleno = {16},
  fjournal = {ACM Transactions on Knowledge Discovery from Data
    (TKDD)},
  journal-URL = {http://portal.acm.org/browse_dl.cfm?idx=J1054},
}
@Article{Hernandez-Orallo:2014:PRC,
author = "Jos{\'e} Hern{\'a}ndez-Orallo",
title = "Probabilistic Reframing for Cost-Sensitive
Regression",
journal = j-TKDD,
volume = "8",
number = "4",
pages = "17:1--17:??",
month = aug,
year = "2014",
CODEN = "????",
DOI = "http://dx.doi.org/10.1145/2641758",
ISSN = "1556-4681 (print), 1556-472X (electronic)",
ISSN-L = "1556-4681",
bibdate = "Tue Aug 26 17:49:02 MDT 2014",
bibsource = "http://www.acm.org/pubs/contents/journals/tkdd/;
http://www.math.utah.edu/pub/tex/bib/tkdd.bib",
abstract = "Common-day applications of predictive models usually
involve the full use of the available contextual
information. When the operating context changes, one
may fine-tune the by-default (incontextual) prediction
or may even abstain from predicting a value (a reject).
Global reframing solutions, where the same function is
applied to adapt the estimated outputs to a new cost
context, are possible solutions here. An alternative
approach, which has not been studied in a comprehensive
way for regression in the knowledge discovery and data
mining literature, is the use of a local (e.g.,
probabilistic) reframing approach, where decisions are
made according to the estimated output and a
reliability, confidence, or probability estimation. In
this article, we advocate for a simple two-parameter
(mean and variance) approach, working with a normal
conditional probability density. Given the conditional
mean produced by any regression technique, we develop
lightweight ``enrichment'' methods that produce good
estimates of the conditional variance, which are used
by the probabilistic (local) reframing methods. We
apply these methods to some very common families of
cost-sensitive problems, such as optimal predictions in
(auction) bids, asymmetric loss scenarios, and
rejection rules.",
acknowledgement = ack-nhfb,
articleno = "17",
fjournal = "ACM Transactions on Knowledge Discovery from Data
(TKDD)",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J1054",
}
@Article{Miettinen:2014:MMD,
author = "Pauli Miettinen and Jilles Vreeken",
title = "{MDL4BMF}: Minimum Description Length for {Boolean}
Matrix Factorization",
journal = j-TKDD,
volume = "8",
number = "4",
pages = "18:1--18:??",
month = oct,
year = "2014",
CODEN = "????",
DOI = "http://dx.doi.org/10.1145/2601437",
ISSN = "1556-4681 (print), 1556-472X (electronic)",
ISSN-L = "1556-4681",
bibdate = "Tue Oct 7 18:45:26 MDT 2014",
bibsource = "http://www.acm.org/pubs/contents/journals/tkdd/;
http://www.math.utah.edu/pub/tex/bib/tkdd.bib",
abstract = "Matrix factorizations---where a given data matrix is
approximated by a product of two or more factor
matrices---are powerful data mining tools. Among other
tasks, matrix factorizations are often used to separate
global structure from noise. This, however, requires
solving the ``model order selection problem'' of
determining the proper rank of the factorization, that
is, to answer where fine-grained structure stops, and
where noise starts. Boolean Matrix Factorization
(BMF)---where data, factors, and matrix product are
Boolean---has in recent years received increased
attention from the data mining community. The technique
has desirable properties, such as high interpretability
and natural sparsity. Yet, so far no method for
selecting the correct model order for BMF has been
available. In this article, we propose the use of the
Minimum Description Length (MDL) principle for this
task. Besides solving the problem, this well-founded
approach has numerous benefits; for example, it is
automatic, does not require a likelihood function, is
fast, and, as experiments show, is highly accurate. We
formulate the description length function for BMF in
general---making it applicable for any BMF algorithm. We
discuss how to construct an appropriate encoding:
starting from a simple and intuitive approach, we
arrive at a highly efficient data-to-model--based
encoding for BMF. We extend an existing algorithm for
BMF to use MDL to identify the best Boolean matrix
factorization, analyze the complexity of the problem,
and perform an extensive experimental evaluation to
study its behavior.",
acknowledgement = ack-nhfb,
articleno = "18",
fjournal = "ACM Transactions on Knowledge Discovery from Data
(TKDD)",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J1054",
}
@Article{Tang:2014:FSS,
author = "Jiliang Tang and Huan Liu",
title = "Feature Selection for Social Media Data",
journal = j-TKDD,
volume = "8",
number = "4",
pages = "19:1--19:??",
month = oct,
year = "2014",
CODEN = "????",
DOI = "http://dx.doi.org/10.1145/2629587",
ISSN = "1556-4681 (print), 1556-472X (electronic)",
ISSN-L = "1556-4681",
bibdate = "Tue Oct 7 18:45:26 MDT 2014",
bibsource = "http://www.acm.org/pubs/contents/journals/tkdd/;
http://www.math.utah.edu/pub/tex/bib/tkdd.bib",
abstract = "Feature selection is widely used in preparing
high-dimensional data for effective data mining. The
explosive popularity of social media produces massive
and high-dimensional data at an unprecedented rate,
presenting new challenges to feature selection. Social
media data consists of (1) traditional
high-dimensional, attribute-value data such as posts,
tweets, comments, and images, and (2) linked data that
provides social context for posts and describes the
relationships between social media users as well as who
generates the posts, and so on. The nature of social
media also determines that its data is massive, noisy,
and incomplete, which exacerbates the already
challenging problem of feature selection. In this
article, we study a novel feature selection problem of
selecting features for social media data with its
social context. In detail, we illustrate the
differences between attribute-value data and social
media data, investigate if linked data can be exploited
in a new feature selection framework by taking
advantage of social science theories. We design and
conduct experiments on datasets from real-world social
media Web sites, and the empirical results demonstrate
that the proposed framework can significantly improve
the performance of feature selection. Further
experiments are conducted to evaluate the effects of
user--user and user--post relationships manifested in
linked data on feature selection, and research issues
for future work will be discussed.",
acknowledgement = ack-nhfb,
articleno = "19",
fjournal = "ACM Transactions on Knowledge Discovery from Data
(TKDD)",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J1054",
}
@Article{Riondato:2014:EDA,
author = "Matteo Riondato and Eli Upfal",
title = "Efficient Discovery of Association Rules and Frequent
Itemsets through Sampling with Tight Performance
Guarantees",
journal = j-TKDD,
volume = "8",
number = "4",
pages = "20:1--20:??",
month = oct,
year = "2014",
CODEN = "????",
DOI = "http://dx.doi.org/10.1145/2629586",
ISSN = "1556-4681 (print), 1556-472X (electronic)",
ISSN-L = "1556-4681",
bibdate = "Tue Oct 7 18:45:26 MDT 2014",
bibsource = "http://www.acm.org/pubs/contents/journals/tkdd/;
http://www.math.utah.edu/pub/tex/bib/tkdd.bib",
abstract = "The tasks of extracting (top-$K$) Frequent Itemsets
(FIs) and Association Rules (ARs) are fundamental
primitives in data mining and database applications.
Exact algorithms for these problems exist and are
widely used, but their running time is hindered by the
need of scanning the entire dataset, possibly multiple
times. High-quality approximations of FIs and ARs are
sufficient for most practical uses. Sampling techniques
can be used for fast discovery of approximate
solutions, but works exploring this technique did not
provide satisfactory performance guarantees on the
quality of the approximation due to the difficulty of
bounding the probability of under- or oversampling any
one of an unknown number of frequent itemsets. We
circumvent this issue by applying the statistical
concept of Vapnik--Chervonenkis (VC) dimension to
develop a novel technique for providing tight bounds on
the sample size that guarantees approximation of the
(top-$K$) FIs and ARs within user-specified parameters.
The resulting sample size is linearly dependent on the
VC-dimension of a range space associated with the
dataset. We analyze the VC-dimension of this range
space and show that it is upper bounded by an
easy-to-compute characteristic quantity of the dataset,
the d-index, namely, the maximum integer d such that
the dataset contains at least d transactions of length
at least d such that no one of them is a superset of or
equal to another. We show that this bound is tight for
a large class of datasets. The resulting sample size is
a significant improvement over previous known results.
We present an extensive experimental evaluation of our
technique on real and artificial datasets,
demonstrating the practicality of our methods, and
showing that they achieve even higher quality
approximations than what is guaranteed by the
analysis.",
acknowledgement = ack-nhfb,
articleno = "20",
fjournal = "ACM Transactions on Knowledge Discovery from Data
(TKDD)",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J1054",
}
@Article{Burton:2014:DSC,
author = "Scott H. Burton and Christophe G. Giraud-Carrier",
title = "Discovering Social Circles in Directed Graphs",
journal = j-TKDD,
volume = "8",
number = "4",
pages = "21:1--21:??",
month = aug,
year = "2014",
CODEN = "????",
DOI = "http://dx.doi.org/10.1145/2641759",
ISSN = "1556-4681 (print), 1556-472X (electronic)",
ISSN-L = "1556-4681",
bibdate = "Tue Aug 26 17:49:02 MDT 2014",
bibsource = "http://www.acm.org/pubs/contents/journals/tkdd/;
http://www.math.utah.edu/pub/tex/bib/tkdd.bib",
abstract = "We examine the problem of identifying social circles,
or sets of cohesive and mutually aware nodes
surrounding an initial query set, in directed graphs
where the complete graph is not known beforehand. This
problem differs from local community mining, in that
the query set defines the circle of interest. We
explicitly handle edge direction, as in many cases
relationships are not symmetric, and focus on the local
context because many real-world graphs cannot be
feasibly known. We outline several issues that are
unique to this context, introduce a quality function to
measure the value of including a particular node in an
emerging social circle, and describe a greedy social
circle discovery algorithm. We demonstrate the
effectiveness of this approach on artificial
benchmarks, large networks with topical community
labels, and several real-world case studies.",
acknowledgement = ack-nhfb,
articleno = "21",
fjournal = "ACM Transactions on Knowledge Discovery from Data
(TKDD)",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J1054",
}
@Article{Paul:2014:RPL,
author = "Saurabh Paul and Christos Boutsidis and Malik
Magdon-Ismail and Petros Drineas",
title = "Random Projections for Linear Support Vector
Machines",
journal = j-TKDD,
volume = "8",
number = "4",
pages = "22:1--22:??",
month = oct,
year = "2014",
CODEN = "????",
DOI = "http://dx.doi.org/10.1145/2641760",
ISSN = "1556-4681 (print), 1556-472X (electronic)",
ISSN-L = "1556-4681",
bibdate = "Tue Oct 7 18:45:26 MDT 2014",
bibsource = "http://www.acm.org/pubs/contents/journals/tkdd/;
http://www.math.utah.edu/pub/tex/bib/tkdd.bib",
abstract = "Let $X$ be a data matrix of rank $ \rho $, whose rows
represent $n$ points in $d$-dimensional space. The
linear support vector machine constructs a hyperplane
separator that maximizes the 1-norm soft margin. We
develop a new oblivious dimension reduction technique
that is precomputed and can be applied to any input
matrix $X$. We prove that, with high probability, the
margin and minimum enclosing ball in the feature space
are preserved to within $ \epsilon $-relative error,
ensuring comparable generalization as in the original
space in the case of classification. For regression, we
show that the margin is preserved to $ \epsilon
$-relative error with high probability. We present
extensive experiments with real and synthetic data to
support our theory.",
acknowledgement = ack-nhfb,
articleno = "22",
fjournal = "ACM Transactions on Knowledge Discovery from Data
(TKDD)",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J1054",
}
@Article{Erdo:2014:RGN,
author = "D{\'o}ra Erd{\H{o}}s and Rainer Gemulla and Evimaria
Terzi",
title = "Reconstructing Graphs from Neighborhood Data",
journal = j-TKDD,
volume = "8",
number = "4",
pages = "23:1--23:??",
month = aug,
year = "2014",
CODEN = "????",
DOI = "http://dx.doi.org/10.1145/2641761",
ISSN = "1556-4681 (print), 1556-472X (electronic)",
ISSN-L = "1556-4681",
bibdate = "Tue Aug 26 17:49:02 MDT 2014",
bibsource = "http://www.acm.org/pubs/contents/journals/tkdd/;
http://www.math.utah.edu/pub/tex/bib/tkdd.bib",
abstract = "Consider a social network and suppose that we are only
given the number of common friends between each pair of
users. Can we reconstruct the underlying network?
Similarly, consider a set of documents and the words
that appear in them. If we only know the number of
common words for every pair of documents, as well as
the number of common documents for every pair of words,
can we infer which words appear in which documents? In
this article, we develop a general methodology for
answering questions like these. We formalize these
questions in what we call the {\em Reconstruct}
problem: given information about the common neighbors
of nodes in a network, our goal is to reconstruct the
hidden binary matrix that indicates the presence or
absence of relationships between individual nodes. In
fact, we propose two different variants of this
problem: one where the number of connections of every
node (i.e., the degree of every node) is known and a
second one where it is unknown. We call these variants
the degree-aware and the degree-oblivious versions of
the Reconstruct problem, respectively. Our algorithms
for both variants exploit the properties of the
singular value decomposition of the hidden binary
matrix. More specifically, we show that using the
available neighborhood information, we can reconstruct
the hidden matrix by finding the components of its
singular value decomposition and then combining them
appropriately. Our extensive experimental study
suggests that our methods are able to reconstruct
binary matrices of different characteristics with up to
100\% accuracy.",
acknowledgement = ack-nhfb,
articleno = "23",
fjournal = "ACM Transactions on Knowledge Discovery from Data
(TKDD)",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J1054",
}
@Article{Acharya:2014:OFC,
author = "Ayan Acharya and Eduardo R. Hruschka and Joydeep Ghosh
and Sreangsu Acharyya",
title = "An Optimization Framework for Combining Ensembles of
Classifiers and Clusterers with Applications to
Nontransductive Semisupervised Learning and Transfer
Learning",
journal = j-TKDD,
volume = "9",
number = "1",
pages = "1:1--1:??",
month = aug,
year = "2014",
CODEN = "????",
DOI = "http://dx.doi.org/10.1145/2601435",
ISSN = "1556-4681 (print), 1556-472X (electronic)",
ISSN-L = "1556-4681",
bibdate = "Tue Aug 26 17:49:05 MDT 2014",
bibsource = "http://www.acm.org/pubs/contents/journals/tkdd/;
http://www.math.utah.edu/pub/tex/bib/tkdd.bib",
abstract = "Unsupervised models can provide supplementary soft
constraints to help classify new ``target'' data
because similar instances in the target set are more
likely to share the same class label. Such models can
also help detect possible differences between training
and target distributions, which is useful in
applications where concept drift may take place, as in
transfer learning settings. This article describes a
general optimization framework that takes as input
class membership estimates from existing classifiers
learned on previously encountered ``source'' (or
training) data, as well as a similarity matrix from a
cluster ensemble operating solely on the target (or
test) data to be classified, and yields a consensus
labeling of the target data. More precisely, the
application settings considered are nontransductive
semisupervised and transfer learning scenarios where
the training data are used only to build an ensemble of
classifiers and are subsequently discarded before
classifying the target data. The framework admits a
wide range of loss functions and
classification/clustering methods. It exploits
properties of Bregman divergences in conjunction with
Legendre duality to yield a principled and scalable
approach. A variety of experiments show that the
proposed framework can yield results substantially
superior to those provided by na{\"\i}vely applying
classifiers learned on the original task to the target
data. In addition, we show that the proposed approach,
even not being conceptually transductive, can provide
better results compared to some popular transductive
learning techniques.",
acknowledgement = ack-nhfb,
articleno = "1",
fjournal = "ACM Transactions on Knowledge Discovery from Data
(TKDD)",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J1054",
}
@Article{Boedihardjo:2014:FEL,
author = "Arnold P. Boedihardjo and Chang-Tien Lu and Bingsheng
Wang",
title = "A Framework for Exploiting Local Information to
Enhance Density Estimation of Data Streams",
journal = j-TKDD,
volume = "9",
number = "1",
pages = "2:1--2:??",
month = aug,
year = "2014",
CODEN = "????",
DOI = "http://dx.doi.org/10.1145/2629618",
ISSN = "1556-4681 (print), 1556-472X (electronic)",
ISSN-L = "1556-4681",
bibdate = "Tue Aug 26 17:49:05 MDT 2014",
bibsource = "http://www.acm.org/pubs/contents/journals/tkdd/;
http://www.math.utah.edu/pub/tex/bib/tkdd.bib",
abstract = "The Probability Density Function (PDF) is the
fundamental data model for a variety of stream mining
algorithms. Existing works apply the standard
nonparametric Kernel Density Estimator (KDE) to
approximate the PDF of data streams. As a result, the
stream-based KDEs cannot accurately capture complex
local density features. In this article, we propose the
use of Local Region (LRs) to model local density
information in univariate data streams. In-depth
theoretical analyses are presented to justify the
effectiveness of the LR-based KDE. Based on the
analyses, we develop the General Local rEgion AlgorithM
(GLEAM) to enhance the estimation quality of
structurally complex univariate distributions for
existing stream-based KDEs. A set of algorithmic
optimizations is designed to improve the query
throughput of GLEAM and to achieve its linear order
computation. Additionally, a comprehensive suite of
experiments was conducted to test the effectiveness and
efficiency of GLEAM.",
acknowledgement = ack-nhfb,
articleno = "2",
fjournal = "ACM Transactions on Knowledge Discovery from Data
(TKDD)",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J1054",
}
@Article{Ordonez:2014:BVS,
author = "Carlos Ordonez and Carlos Garcia-Alvarado and
Veerabhadaran Baladandayuthapani",
title = "{Bayesian} Variable Selection in Linear Regression in
One Pass for Large Datasets",
journal = j-TKDD,
volume = "9",
number = "1",
pages = "3:1--3:??",
month = aug,
year = "2014",
CODEN = "????",
DOI = "http://dx.doi.org/10.1145/2629617",
ISSN = "1556-4681 (print), 1556-472X (electronic)",
ISSN-L = "1556-4681",
bibdate = "Tue Aug 26 17:49:05 MDT 2014",
bibsource = "http://www.acm.org/pubs/contents/journals/tkdd/;
http://www.math.utah.edu/pub/tex/bib/tkdd.bib",
abstract = "Bayesian models are generally computed with Markov
Chain Monte Carlo (MCMC) methods. The main disadvantage
of MCMC methods is the large number of iterations they
need to sample the posterior distributions of model
parameters, especially for large datasets. On the other
hand, variable selection remains a challenging problem
due to its combinatorial search space, where Bayesian
models are a promising solution. In this work, we study
how to accelerate Bayesian model computation for
variable selection in linear regression. We propose a
fast Gibbs sampler algorithm, a widely used MCMC method
that incorporates several optimizations. We use a
Zellner prior for the regression coefficients, an
improper prior on variance, and a conjugate prior
Gaussian distribution, which enable dataset
summarization in one pass, thus exploiting an augmented
set of sufficient statistics. Thereafter, the algorithm
iterates in main memory. Sufficient statistics are
indexed with a sparse binary vector to efficiently
compute matrix projections based on selected variables.
Discovered variable subsets probabilities, selecting
and discarding each variable, are stored on a hash
table for fast retrieval in future iterations. We study
how to integrate our algorithm into a Database
Management System (DBMS), exploiting aggregate
User-Defined Functions for parallel data summarization
and stored procedures to manipulate matrices with
arrays. An experimental evaluation with real datasets
evaluates accuracy and time performance, comparing our
DBMS-based algorithm with the R package. Our algorithm
is shown to produce accurate results, scale linearly on
dataset size, and run orders of magnitude faster than
the R package.",
acknowledgement = ack-nhfb,
articleno = "3",
fjournal = "ACM Transactions on Knowledge Discovery from Data
(TKDD)",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J1054",
}
@Article{Fei:2014:SSB,
author = "Hongliang Fei and Jun Huan",
title = "Structured Sparse Boosting for Graph Classification",
journal = j-TKDD,
volume = "9",
number = "1",
pages = "4:1--4:??",
month = aug,
year = "2014",
CODEN = "????",
DOI = "http://dx.doi.org/10.1145/2629328",
ISSN = "1556-4681 (print), 1556-472X (electronic)",
ISSN-L = "1556-4681",
bibdate = "Tue Aug 26 17:49:05 MDT 2014",
bibsource = "http://www.acm.org/pubs/contents/journals/tkdd/;
http://www.math.utah.edu/pub/tex/bib/tkdd.bib",
abstract = "Boosting is a highly effective algorithm that produces
a linear combination of weak classifiers (a.k.a. base
learners) to obtain high-quality classification models.
In this article, we propose a generalized logit boost
algorithm in which base learners have structural
relationships in the functional space. Although such
relationships are generic, our work is particularly
motivated by the emerging topic of pattern-based
classification for semistructured data including
graphs. Toward an efficient incorporation of the
structure information, we have designed a general model
in which we use an undirected graph to capture the
relationship of subgraph-based base learners. In our
method, we employ both L$_1$ and Laplacian-based L$_2$
regularization to logit boosting to achieve model
sparsity and smoothness in the functional space spanned
by the base learners. We have derived efficient
optimization algorithms based on coordinate descent for
the new boosting formulation and theoretically prove
that it exhibits a natural grouping effect for nearby
spatial or overlapping base learners and that the
resulting estimator is consistent. Additionally,
motivated by the connection between logit boosting and
logistic regression, we extend our structured sparse
regularization framework to logistic regression for
vectorial data in which features are structured. Using
comprehensive experimental study and comparing our work
with the state-of-the-art, we have demonstrated the
effectiveness of the proposed learning method.",
acknowledgement = ack-nhfb,
articleno = "4",
fjournal = "ACM Transactions on Knowledge Discovery from Data
(TKDD)",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J1054",
}
@Article{Xu:2014:GGB,
author = "Zhiqiang Xu and Yiping Ke and Yi Wang and Hong Cheng
and James Cheng",
title = "{GBAGC}: a General {Bayesian} Framework for Attributed
Graph Clustering",
journal = j-TKDD,
volume = "9",
number = "1",
pages = "5:1--5:??",
month = aug,
year = "2014",
CODEN = "????",
DOI = "http://dx.doi.org/10.1145/2629616",
ISSN = "1556-4681 (print), 1556-472X (electronic)",
ISSN-L = "1556-4681",
bibdate = "Tue Aug 26 17:49:05 MDT 2014",
bibsource = "http://www.acm.org/pubs/contents/journals/tkdd/;
http://www.math.utah.edu/pub/tex/bib/tkdd.bib",
abstract = "Graph clustering, also known as community detection,
is a long-standing problem in data mining. In recent
years, with the proliferation of rich attribute
information available for objects in real-world graphs,
how to leverage not only structural but also attribute
information for clustering attributed graphs becomes a
new challenge. Most existing works took a
distance-based approach. They proposed various distance
measures to fuse structural and attribute information
and then applied standard techniques for graph
clustering based on these distance measures. In this
article, we take an alternative view and propose a
novel Bayesian framework for attributed graph
clustering. Our framework provides a general and
principled solution to modeling both the structural and
the attribute aspects of a graph. It avoids the
artificial design of a distance measure in existing
methods and, furthermore, can seamlessly handle graphs
with different types of edges and vertex attributes. We
develop an efficient variational method for graph
clustering under this framework and derive two concrete
algorithms for clustering unweighted and weighted
attributed graphs. Experimental results on large
real-world datasets show that our algorithms
significantly outperform the state-of-the-art
distance-based method, in terms of both effectiveness
and efficiency.",
acknowledgement = ack-nhfb,
articleno = "5",
fjournal = "ACM Transactions on Knowledge Discovery from Data
(TKDD)",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J1054",
}
@Article{Coscia:2014:UHO,
author = "Michele Coscia and Giulio Rossetti and Fosca Giannotti
and Dino Pedreschi",
title = "Uncovering Hierarchical and Overlapping Communities
with a Local-First Approach",
journal = j-TKDD,
volume = "9",
number = "1",
pages = "6:1--6:??",
month = aug,
year = "2014",
CODEN = "????",
DOI = "http://dx.doi.org/10.1145/2629511",
ISSN = "1556-4681 (print), 1556-472X (electronic)",
ISSN-L = "1556-4681",
bibdate = "Tue Aug 26 17:49:05 MDT 2014",
bibsource = "http://www.acm.org/pubs/contents/journals/tkdd/;
http://www.math.utah.edu/pub/tex/bib/tkdd.bib",
abstract = "Community discovery in complex networks is the task of
organizing a network's structure by grouping together
nodes related to each other. Traditional approaches are
based on the assumption that there is a global-level
organization in the network. However, in many
scenarios, each node is the bearer of complex
information and cannot be classified in disjoint
clusters. The top-down global view of the partition
approach is not designed for this. Here, we represent
this complex information as multiple latent labels, and
we postulate that edges in the networks are created
among nodes carrying similar labels. The latent labels
are the communities a node belongs to and we discover
them with a simple local-first approach to community
discovery. This is achieved by democratically letting
each node vote for the communities it sees surrounding
it in its limited view of the global system, its ego
neighborhood, using a label propagation algorithm,
assuming that each node is aware of the label it shares
with each of its connections. The local communities are
merged hierarchically, unveiling the modular
organization of the network at the global level and
identifying overlapping groups and groups of groups. We
tested this intuition against the state-of-the-art
overlapping community discovery and found that our new
method advances in the chosen scenarios in the quality
of the obtained communities. We perform a test on
benchmark and on real-world networks, evaluating the
quality of the community coverage by using the
extracted communities to predict the metadata attached
to the nodes, which we consider external information
about the latent labels. We also provide an explanation
about why real-world networks contain overlapping
communities and how our logic is able to capture them.
Finally, we show how our method is deterministic, is
incremental, and has a limited time complexity, so that
it can be used on real-world scale networks.",
acknowledgement = ack-nhfb,
articleno = "6",
fjournal = "ACM Transactions on Knowledge Discovery from Data
(TKDD)",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J1054",
}
@Article{Wang:2014:GML,
author = "Guangtao Wang and Qinbao Song and Xueying Zhang and
Kaiyuan Zhang",
title = "A Generic Multilabel Learning-Based Classification
Algorithm Recommendation Method",
journal = j-TKDD,
volume = "9",
number = "1",
pages = "7:1--7:??",
month = oct,
year = "2014",
CODEN = "????",
DOI = "http://dx.doi.org/10.1145/2629474",
ISSN = "1556-4681 (print), 1556-472X (electronic)",
ISSN-L = "1556-4681",
bibdate = "Fri Oct 10 17:19:10 MDT 2014",
bibsource = "http://www.acm.org/pubs/contents/journals/tkdd/;
http://www.math.utah.edu/pub/tex/bib/tkdd.bib",
abstract = "As more and more classification algorithms continue to
be developed, recommending appropriate algorithms to a
given classification problem is increasingly important.
This article first distinguishes the algorithm
recommendation methods by two dimensions: (1)
meta-features, which are a set of measures used to
characterize the learning problems, and (2)
meta-target, which represents the relative performance
of the classification algorithms on the learning
problem. In contrast to the existing algorithm
recommendation methods whose meta-target is usually in
the form of either the ranking of candidate algorithms
or a single algorithm, this article proposes a new and
natural multilabel form to describe the meta-target.
This is due to the fact that there would be multiple
algorithms being appropriate for a given problem in
practice. Furthermore, a novel multilabel
learning-based generic algorithm recommendation method
is proposed, which views the algorithm recommendation
as a multilabel learning problem and solves the problem
by the mature multilabel learning algorithms. To
evaluate the proposed multilabel learning-based
recommendation method, extensive experiments with 13
well-known classification algorithms, two kinds of
meta-targets such as algorithm ranking and single
algorithm, and five different kinds of meta-features
are conducted on 1,090 benchmark learning problems. The
results show the effectiveness of our proposed
multilabel learning-based recommendation method.",
acknowledgement = ack-nhfb,
articleno = "7",
fjournal = "ACM Transactions on Knowledge Discovery from Data
(TKDD)",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J1054",
}
@Article{Wang:2014:EEM,
author = "Pinghui Wang and John C. S. Lui and Bruno Ribeiro and
Don Towsley and Junzhou Zhao and Xiaohong Guan",
title = "Efficiently Estimating Motif Statistics of Large
Networks",
journal = j-TKDD,
volume = "9",
number = "2",
pages = "8:1--8:??",
month = sep,
year = "2014",
CODEN = "????",
DOI = "http://dx.doi.org/10.1145/2629564",
ISSN = "1556-4681 (print), 1556-472X (electronic)",
ISSN-L = "1556-4681",
bibdate = "Tue Oct 7 18:49:26 MDT 2014",
bibsource = "http://www.acm.org/pubs/contents/journals/tkdd/;
http://www.math.utah.edu/pub/tex/bib/tkdd.bib",
abstract = "Exploring statistics of locally connected subgraph
patterns (also known as network motifs) has helped
researchers better understand the structure and
function of biological and Online Social Networks
(OSNs). Nowadays, the massive size of some critical
networks---often stored in already overloaded relational
databases---effectively limits the rate at which nodes
and edges can be explored, making it a challenge to
accurately discover subgraph statistics. In this work,
we propose sampling methods to accurately estimate
subgraph statistics from as few queried nodes as
possible. We present sampling algorithms that
efficiently and accurately estimate subgraph properties
of massive networks. Our algorithms require no
precomputation or complete network topology
information. At the same time, we provide theoretical
guarantees of convergence. We perform experiments using
widely known datasets and show that, for the same
accuracy, our algorithms require an order of magnitude
less queries (samples) than the current
state-of-the-art algorithms.",
acknowledgement = ack-nhfb,
articleno = "8",
fjournal = "ACM Transactions on Knowledge Discovery from Data
(TKDD)",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J1054",
}
@Article{Zheng:2014:FHE,
  author =       {Li Zheng and Tao Li and Chris Ding},
  title =        {A Framework for Hierarchical Ensemble Clustering},
  journal =      j-TKDD,
  volume =       {9},
  number =       {2},
  pages =        {9:1--9:??},
  month =        sep,
  year =         {2014},
  CODEN =        {????},
  DOI =          {http://dx.doi.org/10.1145/2611380},
  ISSN =         {1556-4681 (print), 1556-472X (electronic)},
  ISSN-L =       {1556-4681},
  bibdate =      {Tue Oct 7 18:49:26 MDT 2014},
  bibsource =    {http://www.acm.org/pubs/contents/journals/tkdd/;
                 http://www.math.utah.edu/pub/tex/bib/tkdd.bib},
  abstract =     {Ensemble clustering, as an important extension of the
                 clustering problem, refers to the problem of combining
                 different (input) clusterings of a given dataset to
                 generate a final (consensus) clustering that is a
                 better fit in some sense than existing clusterings.
                 Over the past few years, many ensemble clustering
                 approaches have been developed. However, most of them
                 are designed for partitional clustering methods, and
                 few research efforts have been reported for ensemble
                 hierarchical clustering methods. In this article, a
                 hierarchical ensemble clustering framework that can
                 naturally combine both partitional clustering and
                 hierarchical clustering results is proposed. In
                 addition, a novel method for learning the ultra-metric
                 distance from the aggregated distance matrices and
                 generating final hierarchical clustering with enhanced
                 cluster separation is developed based on the
                 ultra-metric distance for hierarchical clustering. We
                 study three important problems: dendrogram description,
                 dendrogram combination, and dendrogram selection. We
                 develop two approaches for dendrogram selection based
                 on tree distances, and we investigate various
                 dendrogram distances for representing dendrograms. We
                 provide a systematic empirical study of the ensemble
                 hierarchical clustering problem. Experimental results
                 demonstrate the effectiveness of our proposed
                 approaches.},
  acknowledgement = ack-nhfb,
  articleno =    {9},
  fjournal =     {ACM Transactions on Knowledge Discovery from Data
                 (TKDD)},
  journal-URL =  {http://portal.acm.org/browse_dl.cfm?idx=J1054},
}
@Article{Huai:2014:TPC,
  author =       {Baoxing Huai and Enhong Chen and Hengshu Zhu and Hui
                 Xiong and Tengfei Bao and Qi Liu and Jilei Tian},
  title =        {Toward Personalized Context Recognition for Mobile
                 Users: a Semisupervised {Bayesian} {HMM} Approach},
  journal =      j-TKDD,
  volume =       {9},
  number =       {2},
  pages =        {10:1--10:??},
  month =        sep,
  year =         {2014},
  CODEN =        {????},
  DOI =          {http://dx.doi.org/10.1145/2629504},
  ISSN =         {1556-4681 (print), 1556-472X (electronic)},
  ISSN-L =       {1556-4681},
  bibdate =      {Tue Oct 7 18:49:26 MDT 2014},
  bibsource =    {http://www.acm.org/pubs/contents/journals/tkdd/;
                 http://www.math.utah.edu/pub/tex/bib/tkdd.bib},
  abstract =     {The problem of mobile context recognition targets the
                 identification of semantic meaning of context in a
                 mobile environment. This plays an important role in
                 understanding mobile user behaviors and thus provides
                 the opportunity for the development of better
                 intelligent context-aware services. A key step of
                 context recognition is to model the personalized
                 contextual information of mobile users. Although many
                 studies have been devoted to mobile context modeling,
                 limited efforts have been made on the exploitation of
                 the sequential and dependency characteristics of mobile
                 contextual information. Also, the latent semantics
                 behind mobile context are often ambiguous and poorly
                 understood. Indeed, a promising direction is to
                 incorporate some domain knowledge of common contexts,
                 such as ``waiting for a bus'' or ``having dinner,'' by
                 modeling both labeled and unlabeled context data from
                 mobile users because there are often few labeled
                 contexts available in practice. To this end, in this
                 article, we propose a sequence-based semisupervised
                 approach to modeling personalized context for mobile
                 users. Specifically, we first exploit the Bayesian
                 Hidden Markov Model (B-HMM) for modeling context in the
                 form of probabilistic distributions and transitions of
                 raw context data. Also, we propose a sequential model
                 by extending B-HMM with the prior knowledge of
                 contextual features to model context more accurately.
                 Then, to efficiently learn the parameters and initial
                 values of the proposed models, we develop a novel
                 approach for parameter estimation by integrating the
                 Dirichlet Process Mixture (DPM) model and the Mixture
                 Unigram (MU) model. Furthermore, by incorporating both
                 user-labeled and unlabeled data, we propose a
                 semisupervised learning-based algorithm to identify and
                 model the latent semantics of context. Finally,
                 experimental results on real-world data clearly
                 validate both the efficiency and effectiveness of the
                 proposed approaches for recognizing personalized
                 context of mobile users.},
  acknowledgement = ack-nhfb,
  articleno =    {10},
  fjournal =     {ACM Transactions on Knowledge Discovery from Data
                 (TKDD)},
  journal-URL =  {http://portal.acm.org/browse_dl.cfm?idx=J1054},
}
@Article{Liu:2014:ADI,
  author =       {Siyuan Liu and Lei Chen and Lionel M. Ni},
  title =        {Anomaly Detection from Incomplete Data},
  journal =      j-TKDD,
  volume =       {9},
  number =       {2},
  pages =        {11:1--11:??},
  month =        sep,
  year =         {2014},
  CODEN =        {????},
  DOI =          {http://dx.doi.org/10.1145/2629668},
  ISSN =         {1556-4681 (print), 1556-472X (electronic)},
  ISSN-L =       {1556-4681},
  bibdate =      {Tue Oct 7 18:49:26 MDT 2014},
  bibsource =    {http://www.acm.org/pubs/contents/journals/tkdd/;
                 http://www.math.utah.edu/pub/tex/bib/tkdd.bib},
  abstract =     {Anomaly detection (a.k.a., outlier or burst detection)
                 is a well-motivated problem and a major data mining and
                 knowledge discovery task. In this article, we study the
                 problem of population anomaly detection, one of the key
                 issues related to event monitoring and population
                 management within a city. Through studying detected
                 population anomalies, we can trace and analyze these
                 anomalies, which could help to model city traffic
                 design and event impact analysis and prediction.
                 Although a significant and interesting issue, it is
                 very hard to detect population anomalies and retrieve
                 anomaly trajectories, especially given that it is
                 difficult to get actual and sufficient population data.
                 To address the difficulties of a lack of real
                 population data, we take advantage of mobile phone
                 networks, which offer enormous spatial and temporal
                 communication data on persons. More importantly, we
                 claim that we can utilize these mobile phone data to
                 infer and approximate population data. Thus, we can
                 study the population anomaly detection problem by
                 taking advantages of unique features hidden in mobile
                 phone data. In this article, we present a system to
                 conduct Population Anomaly Detection (PAD). First, we
                 propose an effective clustering method,
                 correlation-based clustering, to cluster the incomplete
                 location information from mobile phone data (i.e., from
                 mobile call volume distribution to population density
                 distribution). Then, we design an adaptive
                 parameter-free detection method, R-scan, to capture the
                 distributed dynamic anomalies. Finally, we devise an
                 efficient algorithm, BT-miner, to retrieve anomaly
                 trajectories. The experimental results from real-life
                 mobile phone data confirm the effectiveness and
                 efficiency of the proposed algorithms. Finally, the
                 proposed methods are realized as a pilot system in a
                 city in China.},
  acknowledgement = ack-nhfb,
  articleno =    {11},
  fjournal =     {ACM Transactions on Knowledge Discovery from Data
                 (TKDD)},
  journal-URL =  {http://portal.acm.org/browse_dl.cfm?idx=J1054},
}
@Article{Gundecha:2014:UVR,
  author =       {Pritam Gundecha and Geoffrey Barbier and Jiliang Tang
                 and Huan Liu},
  title =        {User Vulnerability and Its Reduction on a Social
                 Networking Site},
  journal =      j-TKDD,
  volume =       {9},
  number =       {2},
  pages =        {12:1--12:??},
  month =        sep,
  year =         {2014},
  CODEN =        {????},
  DOI =          {http://dx.doi.org/10.1145/2630421},
  ISSN =         {1556-4681 (print), 1556-472X (electronic)},
  ISSN-L =       {1556-4681},
  bibdate =      {Tue Oct 7 18:49:26 MDT 2014},
  bibsource =    {http://www.acm.org/pubs/contents/journals/tkdd/;
                 http://www.math.utah.edu/pub/tex/bib/tkdd.bib},
  abstract =     {Privacy and security are major concerns for many users
                 of social media. When users share information (e.g.,
                 data and photos) with friends, they can make their
                 friends vulnerable to security and privacy breaches
                 with dire consequences. With the continuous expansion
                 of a user's social network, privacy settings alone are
                 often inadequate to protect a user's profile. In this
                 research, we aim to address some critical issues
                 related to privacy protection: (1) How can we measure
                 and assess individual users' vulnerability? (2) With
                 the diversity of one's social network friends, how can
                 one figure out an effective approach to maintaining
                 balance between vulnerability and social utility? In
                 this work, first we present a novel way to define
                 vulnerable friends from an individual user's
                 perspective. User vulnerability is dependent on whether
                 or not the user's friends' privacy settings protect the
                 friend and the individual's network of friends (which
                 includes the user). We show that it is feasible to
                 measure and assess user vulnerability and reduce one's
                 vulnerability without changing the structure of a
                 social networking site. The approach is to unfriend
                 one's most vulnerable friends. However, when such a
                 vulnerable friend is also socially important,
                 unfriending him or her would significantly reduce one's
                 own social status. We formulate this novel problem as
                 vulnerability minimization with social utility
                 constraints. We formally define the optimization
                 problem and provide an approximation algorithm with a
                 proven bound. Finally, we conduct a large-scale
                 evaluation of a new framework using a Facebook dataset.
                 We resort to experiments and observe how much
                 vulnerability an individual user can be decreased by
                 unfriending a vulnerable friend. We compare performance
                 of different unfriending strategies and discuss the
                 security risk of new friend requests. Additionally, by
                 employing different forms of social utility, we confirm
                 that the balance between user vulnerability and social
                 utility can be practically achieved.},
  acknowledgement = ack-nhfb,
  articleno =    {12},
  fjournal =     {ACM Transactions on Knowledge Discovery from Data
                 (TKDD)},
  journal-URL =  {http://portal.acm.org/browse_dl.cfm?idx=J1054},
}
@Article{Duan:2014:SRC,
  author =       {Lian Duan and W. Nick Street and Yanchi Liu and
                 Songhua Xu and Brook Wu},
  title =        {Selecting the Right Correlation Measure for Binary
                 Data},
  journal =      j-TKDD,
  volume =       {9},
  number =       {2},
  pages =        {13:1--13:??},
  month =        sep,
  year =         {2014},
  CODEN =        {????},
  DOI =          {http://dx.doi.org/10.1145/2637484},
  ISSN =         {1556-4681 (print), 1556-472X (electronic)},
  ISSN-L =       {1556-4681},
  bibdate =      {Tue Oct 7 18:49:26 MDT 2014},
  bibsource =    {http://www.acm.org/pubs/contents/journals/tkdd/;
                 http://www.math.utah.edu/pub/tex/bib/tkdd.bib},
  abstract =     {Finding the most interesting correlations among items
                 is essential for problems in many commercial, medical,
                 and scientific domains. Although there are numerous
                 measures available for evaluating correlations,
                 different correlation measures provide drastically
                 different results. Piatetsky-Shapiro provided three
                 mandatory properties for any reasonable correlation
                 measure, and Tan et al. proposed several properties to
                 categorize correlation measures; however, it is still
                 hard for users to choose the desirable correlation
                 measures according to their needs. In order to solve
                 this problem, we explore the effectiveness problem in
                 three ways. First, we propose two desirable properties
                 and two optional properties for correlation measure
                 selection and study the property satisfaction for
                 different correlation measures. Second, we study
                 different techniques to adjust correlation measures and
                 propose two new correlation measures: the Simplified $
                 \chi^2 $ with Continuity Correction and the Simplified
                 $ \chi^2 $ with Support. Third, we analyze the upper
                 and lower bounds of different measures and categorize
                 them by the bound differences. Combining these three
                 directions, we provide guidelines for users to choose
                 the proper measure according to their needs.},
  acknowledgement = ack-nhfb,
  articleno =    {13},
  fjournal =     {ACM Transactions on Knowledge Discovery from Data
                 (TKDD)},
  journal-URL =  {http://portal.acm.org/browse_dl.cfm?idx=J1054},
}
@Article{Huang:2014:PBA,
  author =       {Hao Huang and Hong Qin and Shinjae Yoo and Dantong
                 Yu},
  title =        {Physics-Based Anomaly Detection Defined on Manifold
                 Space},
  journal =      j-TKDD,
  volume =       {9},
  number =       {2},
  pages =        {14:1--14:??},
  month =        sep,
  year =         {2014},
  CODEN =        {????},
  DOI =          {http://dx.doi.org/10.1145/2641574},
  ISSN =         {1556-4681 (print), 1556-472X (electronic)},
  ISSN-L =       {1556-4681},
  bibdate =      {Tue Oct 7 18:49:26 MDT 2014},
  bibsource =    {http://www.acm.org/pubs/contents/journals/tkdd/;
                 http://www.math.utah.edu/pub/tex/bib/tkdd.bib},
  abstract =     {Current popular anomaly detection algorithms are
                 capable of detecting global anomalies but often fail to
                 distinguish local anomalies from normal instances.
                 Inspired by contemporary physics theory (i.e., heat
                 diffusion and quantum mechanics), we propose two
                 unsupervised anomaly detection algorithms. Building on
                 the embedding manifold derived from heat diffusion, we
                 devise Local Anomaly Descriptor (LAD), which faithfully
                 reveals the intrinsic neighborhood density. It uses a
                 scale-dependent umbrella operator to bridge global and
                 local properties, which makes LAD more informative
                 within an adaptive scope of neighborhood. To offer more
                 stability of local density measurement on scaling
                 parameter tuning, we formulate Fermi Density Descriptor
                 (FDD), which measures the probability of a fermion
                 particle being at a specific location. By choosing the
                 stable energy distribution function, FDD steadily
                 distinguishes anomalies from normal instances with any
                 scaling parameter setting. To further enhance the
                 efficacy of our proposed algorithms, we explore the
                 utility of anisotropic Gaussian kernel (AGK), which
                 offers better manifold-aware affinity information. We
                 also quantify and examine the effect of different
                 Laplacian normalizations for anomaly detection.
                 Comprehensive experiments on both synthetic and
                 benchmark datasets verify that our proposed algorithms
                 outperform the existing anomaly detection algorithms.},
  acknowledgement = ack-nhfb,
  articleno =    {14},
  fjournal =     {ACM Transactions on Knowledge Discovery from Data
                 (TKDD)},
  journal-URL =  {http://portal.acm.org/browse_dl.cfm?idx=J1054},
}
%%% NOTE: the page prefix follows this entry's articleno (15e), matching
%%% the <articleno>:1--<articleno>:?? pattern used by the other entries
%%% and avoiding a clash with article 15 (Jha:2015:SES).  TODO: confirm
%%% the article number against the publisher's table of contents.
@Article{Gionis:2015:ISI,
  author =       "Aristides Gionis and Hang Li",
  title =        "Introduction to the Special Issue {ACM} {SIGKDD}
                 2013",
  journal =      j-TKDD,
  volume =       "9",
  number =       "3",
  pages =        "15e:1--15e:??",
  month =        apr,
  year =         "2015",
  CODEN =        "????",
  DOI =          "http://dx.doi.org/10.1145/2700993",
  ISSN =         "1556-4681 (print), 1556-472X (electronic)",
  ISSN-L =       "1556-4681",
  bibdate =      "Tue Apr 14 09:22:28 MDT 2015",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tkdd/;
                 http://www.math.utah.edu/pub/tex/bib/tkdd.bib",
  acknowledgement = ack-nhfb,
  articleno =    "15e",
  fjournal =     "ACM Transactions on Knowledge Discovery from Data
                 (TKDD)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1054",
}
@Article{Jha:2015:SES,
  author =       {Madhav Jha and C. Seshadhri and Ali Pinar},
  title =        {A Space-Efficient Streaming Algorithm for Estimating
                 Transitivity and Triangle Counts Using the Birthday
                 Paradox},
  journal =      j-TKDD,
  volume =       {9},
  number =       {3},
  pages =        {15:1--15:??},
  month =        feb,
  year =         {2015},
  CODEN =        {????},
  DOI =          {http://dx.doi.org/10.1145/2700395},
  ISSN =         {1556-4681 (print), 1556-472X (electronic)},
  ISSN-L =       {1556-4681},
  bibdate =      {Fri Mar 6 09:34:37 MST 2015},
  bibsource =    {http://www.acm.org/pubs/contents/journals/tkdd/;
                 http://www.math.utah.edu/pub/tex/bib/tkdd.bib},
  abstract =     {We design a space-efficient algorithm that
                 approximates the transitivity (global clustering
                 coefficient) and total triangle count with only a
                 single pass through a graph given as a stream of edges.
                 Our procedure is based on the classic probabilistic
                 result, the birthday paradox. When the transitivity is
                 constant and there are more edges than wedges (common
                 properties for social networks), we can prove that our
                 algorithm requires $O( \sqrt n )$ space ($n$ is the
                 number of vertices) to provide accurate estimates. We
                 run a detailed set of experiments on a variety of real
                 graphs and demonstrate that the memory requirement of
                 the algorithm is a tiny fraction of the graph. For
                 example, even for a graph with 200 million edges, our
                 algorithm stores just 40,000 edges to give accurate
                 results. Being a single pass streaming algorithm, our
                 procedure also maintains a real-time estimate of the
                 transitivity/number of triangles of a graph by storing
                 a minuscule fraction of edges.},
  acknowledgement = ack-nhfb,
  articleno =    {15},
  fjournal =     {ACM Transactions on Knowledge Discovery from Data
                 (TKDD)},
  journal-URL =  {http://portal.acm.org/browse_dl.cfm?idx=J1054},
}
@Article{Tang:2015:FMT,
  author =       "Lu-An Tang and Xiao Yu and Quanquan Gu and Jiawei Han
                 and Guofei Jiang and Alice Leung and Thomas {La
                 Porta}",
  title =        "A Framework of Mining Trajectories from Untrustworthy
                 Data in Cyber-Physical System",
  journal =      j-TKDD,
  volume =       "9",
  number =       "3",
  pages =        "16:1--16:??",
  month =        feb,
  year =         "2015",
  CODEN =        "????",
  DOI =          "http://dx.doi.org/10.1145/2700394",
  ISSN =         "1556-4681 (print), 1556-472X (electronic)",
  ISSN-L =       "1556-4681",
  bibdate =      "Fri Mar 6 09:34:37 MST 2015",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tkdd/;
                 http://www.math.utah.edu/pub/tex/bib/tkdd.bib",
  abstract =     "A cyber-physical system (CPS) integrates physical
                 (i.e., sensor) devices with cyber (i.e., informational)
                 components to form a context-sensitive system that
                 responds intelligently to dynamic changes in real-world
                 situations. The CPS has wide applications in scenarios
                 such as environment monitoring, battlefield
                 surveillance, and traffic control. One key research
                 problem of CPS is called mining lines in the sand. With
                 a large number of sensors (sand) deployed in a
                 designated area, the CPS is required to discover all
                 trajectories (lines) of passing intruders in real time.
                 There are two crucial challenges that need to be
                 addressed: (1) the collected sensor data are not
                 trustworthy, and (2) the intruders do not send out any
                 identification information. The system needs to
                 distinguish multiple intruders and track their
                 movements. This study proposes a method called LiSM
                 (Line-in-the-Sand Miner) to discover trajectories from
                 untrustworthy sensor data. LiSM constructs a watching
                 network from sensor data and computes the locations of
                 intruder appearances based on the link information of
                 the network. The system retrieves a cone model from the
                 historical trajectories to track multiple intruders.
                 Finally, the system validates the mining results and
                 updates sensors' reliability scores in a feedback
                 process. In addition, LoRM (Line-on-the-Road Miner) is
                 proposed for trajectory discovery on road
                 networks---mining lines on the roads. LoRM employs a
                 filtering-and-refinement framework to reduce the
                 distance computational overhead on road networks and
                 uses a shortest-path-measure to track intruders. The
                 proposed methods are evaluated with extensive
                 experiments on big datasets. The experimental results
                 show that the proposed methods achieve higher accuracy
                 and efficiency in trajectory mining tasks.",
  acknowledgement = ack-nhfb,
  articleno =    "16",
  fjournal =     "ACM Transactions on Knowledge Discovery from Data
                 (TKDD)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1054",
}
@Article{Wang:2015:QDR,
  author =       {Zheng Wang and Jieping Ye},
  title =        {Querying Discriminative and Representative Samples for
                 Batch Mode Active Learning},
  journal =      j-TKDD,
  volume =       {9},
  number =       {3},
  pages =        {17:1--17:??},
  month =        feb,
  year =         {2015},
  CODEN =        {????},
  DOI =          {http://dx.doi.org/10.1145/2700408},
  ISSN =         {1556-4681 (print), 1556-472X (electronic)},
  ISSN-L =       {1556-4681},
  bibdate =      {Fri Mar 6 09:34:37 MST 2015},
  bibsource =    {http://www.acm.org/pubs/contents/journals/tkdd/;
                 http://www.math.utah.edu/pub/tex/bib/tkdd.bib},
  abstract =     {Empirical risk minimization (ERM) provides a
                 principled guideline for many machine learning and data
                 mining algorithms. Under the ERM principle, one
                 minimizes an upper bound of the true risk, which is
                 approximated by the summation of empirical risk and the
                 complexity of the candidate classifier class. To
                 guarantee a satisfactory learning performance, ERM
                 requires that the training data are i.i.d. sampled from
                 the unknown source distribution. However, this may not
                 be the case in active learning, where one selects the
                 most informative samples to label, and these data may
                 not follow the source distribution. In this article, we
                 generalize the ERM principle to the active learning
                 setting. We derive a novel form of upper bound for the
                 true risk in the active learning setting; by minimizing
                 this upper bound, we develop a practical batch mode
                 active learning method. The proposed formulation
                 involves a nonconvex integer programming optimization
                 problem. We solve it efficiently by an alternating
                 optimization method. Our method is shown to query the
                 most informative samples while preserving the source
                 distribution as much as possible, thus identifying the
                 most uncertain and representative queries. We further
                 extend our method to multiclass active learning by
                 introducing novel pseudolabels in the multiclass case
                 and developing an efficient algorithm. Experiments on
                 benchmark datasets and real-world applications
                 demonstrate the superior performance of our proposed
                 method compared to state-of-the-art methods.},
  acknowledgement = ack-nhfb,
  articleno =    {17},
  fjournal =     {ACM Transactions on Knowledge Discovery from Data
                 (TKDD)},
  journal-URL =  {http://portal.acm.org/browse_dl.cfm?idx=J1054},
}
@Article{Gopal:2015:HBI,
  author =       {Siddharth Gopal and Yiming Yang},
  title =        {Hierarchical {Bayesian} Inference and Recursive
                 Regularization for Large-Scale Classification},
  journal =      j-TKDD,
  volume =       {9},
  number =       {3},
  pages =        {18:1--18:??},
  month =        apr,
  year =         {2015},
  CODEN =        {????},
  DOI =          {http://dx.doi.org/10.1145/2629585},
  ISSN =         {1556-4681 (print), 1556-472X (electronic)},
  ISSN-L =       {1556-4681},
  bibdate =      {Tue Apr 14 09:22:28 MDT 2015},
  bibsource =    {http://www.acm.org/pubs/contents/journals/tkdd/;
                 http://www.math.utah.edu/pub/tex/bib/tkdd.bib},
  abstract =     {In this article, we address open challenges in
                 large-scale classification, focusing on how to
                 effectively leverage the dependency structures
                 (hierarchical or graphical) among class labels, and how
                 to make the inference scalable in jointly optimizing
                 all model parameters. We propose two main approaches,
                 namely the hierarchical Bayesian inference framework
                 and the recursive regularization scheme. The key idea
                 in both approaches is to reinforce the similarity among
                 parameter across the nodes in a hierarchy or network
                 based on the proximity and connectivity of the nodes.
                 For scalability, we develop hierarchical variational
                 inference algorithms and fast dual coordinate descent
                 training procedures with parallelization. In our
                 experiments for classification problems with hundreds
                 of thousands of classes and millions of training
                 instances with terabytes of parameters, the proposed
                 methods show consistent and statistically significant
                 improvements over other competing approaches, and the
                 best results on multiple benchmark datasets for
                 large-scale classification.},
  acknowledgement = ack-nhfb,
  articleno =    {18},
  fjournal =     {ACM Transactions on Knowledge Discovery from Data
                 (TKDD)},
  journal-URL =  {http://portal.acm.org/browse_dl.cfm?idx=J1054},
}
@Article{Yin:2015:MLB,
  author =       "Hongzhi Yin and Bin Cui and Ling Chen and Zhiting Hu
                 and Chengqi Zhang",
  title =        "Modeling Location-Based User Rating Profiles for
                 Personalized Recommendation",
  journal =      j-TKDD,
  volume =       "9",
  number =       "3",
  pages =        "19:1--19:??",
  month =        apr,
  year =         "2015",
  CODEN =        "????",
  DOI =          "http://dx.doi.org/10.1145/2663356",
  ISSN =         "1556-4681 (print), 1556-472X (electronic)",
  ISSN-L =       "1556-4681",
  bibdate =      "Tue Apr 14 09:22:28 MDT 2015",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tkdd/;
                 http://www.math.utah.edu/pub/tex/bib/tkdd.bib",
  abstract =     "This article proposes LA-LDA, a location-aware
                 probabilistic generative model that exploits
                 location-based ratings to model user profiles and
                 produce recommendations. Most of the existing
                 recommendation models do not consider the spatial
                 information of users or items; however, LA-LDA supports
                 three classes of location-based ratings, namely spatial
                 user ratings for nonspatial items, nonspatial user
                 ratings for spatial items, and spatial user ratings for
                 spatial items. LA-LDA consists of two components,
                 ULA-LDA and ILA-LDA, which are designed to take into
                 account user and item location information,
                 respectively. The component ULA-LDA explicitly
                 incorporates and quantifies the influence from local
                 public preferences to produce recommendations by
                 considering user home locations, whereas the component
                 ILA-LDA recommends items that are closer in both taste
                 and travel distance to the querying users by capturing
                 item co-occurrence patterns, as well as item location
                 co-occurrence patterns. The two components of LA-LDA
                 can be applied either separately or collectively,
                 depending on the available types of location-based
                 ratings. To demonstrate the applicability and
                 flexibility of the LA-LDA model, we deploy it to both
                 top-$k$ recommendation and cold start recommendation
                 scenarios. Experimental evidence on large-scale
                 real-world data, including the data from Gowalla (a
                 location-based social network), DoubanEvent (an
                 event-based social network), and MovieLens (a movie
                 recommendation system), reveal that LA-LDA models user
                 profiles more accurately by outperforming existing
                 recommendation models for top-$k$ recommendation and the
                 cold start problem.",
  acknowledgement = ack-nhfb,
  articleno =    "19",
  fjournal =     "ACM Transactions on Knowledge Discovery from Data
                 (TKDD)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1054",
}
@Article{Hu:2015:PSD,
  author =       {Juhua Hu and De-Chuan Zhan and Xintao Wu and Yuan
                 Jiang and Zhi-Hua Zhou},
  title =        {Pairwised Specific Distance Learning from Physical
                 Linkages},
  journal =      j-TKDD,
  volume =       {9},
  number =       {3},
  pages =        {20:1--20:??},
  month =        apr,
  year =         {2015},
  CODEN =        {????},
  DOI =          {http://dx.doi.org/10.1145/2700405},
  ISSN =         {1556-4681 (print), 1556-472X (electronic)},
  ISSN-L =       {1556-4681},
  bibdate =      {Tue Apr 14 09:22:28 MDT 2015},
  bibsource =    {http://www.acm.org/pubs/contents/journals/tkdd/;
                 http://www.math.utah.edu/pub/tex/bib/tkdd.bib},
  abstract =     {In real tasks, usually a good classification
                 performance can only be obtained when a good distance
                 metric is obtained; therefore, distance metric learning
                 has attracted significant attention in the past few
                 years. Typical studies of distance metric learning
                 evaluate how to construct an appropriate distance
                 metric that is able to separate training data points
                 from different classes or satisfy a set of constraints
                 (e.g., must-links and/or cannot-links). It is
                 noteworthy that this task becomes challenging when
                 there are only limited labeled training data points and
                 no constraints are given explicitly. Moreover, most
                 existing approaches aim to construct a global distance
                 metric that is applicable to all data points. However,
                 different data points may have different properties and
                 may require different distance metrics. We notice that
                 data points in real tasks are often connected by
                 physical links (e.g., people are linked with each other
                 in social networks; personal webpages are often
                 connected to other webpages, including nonpersonal
                 webpages), but the linkage information has not been
                 exploited in distance metric learning. In this article,
                 we develop a pairwised specific distance (PSD) approach
                 that exploits the structures of physical linkages and
                 in particular captures the key observations that
                 nonmetric and clique linkages imply the appearance of
                 different or unique semantics, respectively. It is
                 noteworthy that, rather than generating a global
                 distance, PSD generates different distances for
                 different pairs of data points; this property is
                 desired in applications involving complicated data
                 semantics. We mainly present PSD for multi-class
                 learning and further extend it to multi-label learning.
                 Experimental results validate the effectiveness of PSD,
                 especially in the scenarios in which there are very
                 limited labeled training data points and no explicit
                 constraints are given.},
  acknowledgement = ack-nhfb,
  articleno =    {20},
  fjournal =     {ACM Transactions on Knowledge Discovery from Data
                 (TKDD)},
  journal-URL =  {http://portal.acm.org/browse_dl.cfm?idx=J1054},
}
@Article{Soundarajan:2015:ULG,
  author =       "Sucheta Soundarajan and John E. Hopcroft",
  title =        "Use of Local Group Information to Identify Communities
                 in Networks",
  journal =      j-TKDD,
  volume =       "9",
  number =       "3",
  pages =        "21:1--21:??",
  month =        apr,
  year =         "2015",
  CODEN =        "????",
  DOI =          "http://dx.doi.org/10.1145/2700404",
  ISSN =         "1556-4681 (print), 1556-472X (electronic)",
  ISSN-L =       "1556-4681",
  bibdate =      "Tue Apr 14 09:22:28 MDT 2015",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tkdd/;
                 http://www.math.utah.edu/pub/tex/bib/tkdd.bib",
  abstract =     "The recent interest in networks has inspired a broad
                 range of work on algorithms and techniques to
                 characterize, identify, and extract communities from
                 networks. Such efforts are complicated by a lack of
                 consensus on what a ``community'' truly is, and these
                 disagreements have led to a wide variety of
                 mathematical formulations for describing communities.
                 Often, these mathematical formulations, such as
                 modularity and conductance, have been founded in the
                 general principle that communities, like a $G(n, p)$
                 graph, are ``round,'' with connections throughout the
                 entire community, and so algorithms were developed to
                 optimize such mathematical measures. More recently, a
                 variety of algorithms have been developed that, rather
                 than expecting connectivity through the entire
                 community, seek out very small groups of well-connected
                 nodes and then connect these groups into larger
                 communities. In this article, we examine seven real
                 networks, each containing external annotation that
                 allows us to identify ``annotated communities.'' A
                 study of these annotated communities gives insight into
                 why the second category of community detection
                 algorithms may be more successful than the first
                 category. We then present a flexible algorithm template
                 that is based on the idea of joining together small
                 sets of nodes. In this template, we first identify very
                 small, tightly connected ``subcommunities'' of nodes,
                 each corresponding to a single node's ``perception'' of
                 the network around it. We then create a new network in
                 which each node represents such a subcommunity, and
                 then identify communities in this new network. Because
                 each node can appear in multiple subcommunities, this
                 method allows us to detect overlapping communities.
                 When evaluated on real data, we show that our template
                 outperforms many other state-of-the-art algorithms.",
  acknowledgement = ack-nhfb,
  articleno =    "21",
  fjournal =     "ACM Transactions on Knowledge Discovery from Data
                 (TKDD)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1054",
}
@Article{Wang:2015:UCN,
author = "Pinghui Wang and Junzhou Zhao and John C. S. Lui and
Don Towsley and Xiaohong Guan",
title = "Unbiased Characterization of Node Pairs over Large
Graphs",
journal = j-TKDD,
volume = "9",
number = "3",
pages = "22:1--22:??",
month = apr,
year = "2015",
CODEN = "????",
DOI = "http://dx.doi.org/10.1145/2700393",
ISSN = "1556-4681 (print), 1556-472X (electronic)",
ISSN-L = "1556-4681",
bibdate = "Tue Apr 14 09:22:28 MDT 2015",
bibsource = "http://www.acm.org/pubs/contents/journals/tkdd/;
http://www.math.utah.edu/pub/tex/bib/tkdd.bib",
abstract = "Characterizing user pair relationships is important
for applications such as friend recommendation and
interest targeting in online social networks (OSNs).
Due to the large-scale nature of such networks, it is
infeasible to enumerate all user pairs and thus
sampling is used. In this article, we show that it is a
great challenge for OSN service providers to
characterize user pair relationships, even when they
possess the complete graph topology. The reason is that
when sampling techniques (i.e., uniform vertex sampling
(UVS) and random walk (RW)) are naively applied, they
can introduce large biases, particularly for estimating
similarity distribution of user pairs with constraints
like existence of mutual neighbors, which is important
for applications such as identifying network homophily.
Estimating statistics of user pairs is more challenging
in the absence of the complete topology information, as
an unbiased sampling technique like UVS is usually not
allowed and exploring the OSN graph topology is
expensive. To address these challenges, we present
unbiased sampling methods to characterize user pair
properties based on UVS and RW techniques. We carry out
an evaluation of our methods to show their accuracy and
efficiency. Finally, we apply our methods to three
OSNs---Foursquare, Douban, and Xiami---and discover that
significant homophily is present in these networks.",
acknowledgement = ack-nhfb,
articleno = "22",
fjournal = "ACM Transactions on Knowledge Discovery from Data
(TKDD)",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J1054",
}
@Article{Vlachos:2015:DPC,
author = "Michail Vlachos and Johannes Schneider and Vassilios
G. Vassiliadis",
title = "On Data Publishing with Clustering Preservation",
journal = j-TKDD,
volume = "9",
number = "3",
pages = "23:1--23:??",
month = apr,
year = "2015",
CODEN = "????",
DOI = "http://dx.doi.org/10.1145/2700403",
ISSN = "1556-4681 (print), 1556-472X (electronic)",
ISSN-L = "1556-4681",
bibdate = "Tue Apr 14 09:22:28 MDT 2015",
bibsource = "http://www.acm.org/pubs/contents/journals/tkdd/;
http://www.math.utah.edu/pub/tex/bib/tkdd.bib",
abstract = "The emergence of cloud-based storage services is
opening up new avenues in data exchange and data
dissemination. This has amplified the interest in
right-protection mechanisms to establish ownership in
the event of data leakage. Current right-protection
technologies, however, rarely provide strong guarantees
on dataset utility after the protection process. This
work presents techniques that explicitly address this
topic and provably preserve the outcome of certain
mining operations. In particular, we take special care
to guarantee that the outcome of hierarchical
clustering operations remains the same before and after
right protection. Our approach considers all prevalent
hierarchical clustering variants: single-, complete-,
and average-linkage. We imprint the ownership in a
dataset using watermarking principles, and we derive
tight bounds on the expansion/contraction of distances
incurred by the process. We leverage our analysis to
design fast algorithms for right protection without
exhaustively searching the vast design space. Finally,
because the right-protection process introduces a
user-tunable distortion on the dataset, we explore the
possibility of using this mechanism for data
obfuscation. We quantify the tradeoff between
obfuscation and utility for spatiotemporal datasets and
discover very favorable characteristics of the process.
An additional advantage is that when one is interested
in both right-protecting and obfuscating the original
data values, the proposed mechanism can accomplish both
tasks simultaneously.",
acknowledgement = ack-nhfb,
articleno = "23",
fjournal = "ACM Transactions on Knowledge Discovery from Data
(TKDD)",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J1054",
}
@Article{VazDeMelo:2015:UDP,
author = "Pedro O. S. {Vaz De Melo} and Christos Faloutsos and
Renato Assun{\c{c}}{\~a}o and Rodrigo Alves and Antonio
A. F. Loureiro",
title = "Universal and Distinct Properties of Communication
Dynamics: How to Generate Realistic Inter-event Times",
journal = j-TKDD,
volume = "9",
number = "3",
pages = "24:1--24:??",
month = apr,
year = "2015",
CODEN = "????",
DOI = "http://dx.doi.org/10.1145/2700399",
ISSN = "1556-4681 (print), 1556-472X (electronic)",
ISSN-L = "1556-4681",
bibdate = "Tue Apr 14 09:22:28 MDT 2015",
bibsource = "http://www.acm.org/pubs/contents/journals/tkdd/;
http://www.math.utah.edu/pub/tex/bib/tkdd.bib",
abstract = "With the advancement of information systems, means of
communications are becoming cheaper, faster, and more
available. Today, millions of people carrying
smartphones or tablets are able to communicate
practically any time and anywhere they want. They can
access their e-mails, comment on weblogs, watch and
post videos and photos (as well as comment on them),
and make phone calls or text messages almost
ubiquitously. Given this scenario, in this article, we
tackle a fundamental aspect of this new era of
communication: How the time intervals between
communication events behave for different technologies
and means of communications. Are there universal
patterns for the Inter-Event Time Distribution (IED)?
How do inter-event times behave differently among
particular technologies? To answer these questions, we
analyzed eight different datasets from real and modern
communication data and found four well-defined patterns
seen in all the eight datasets. Moreover, we propose
the use of the Self-Feeding Process (SFP) to generate
inter-event times between communications. The SFP is an
extremely parsimonious point process that requires at
most two parameters and is able to generate inter-event
times with all the universal properties we observed in
the data. We also show three potential applications of
the SFP: as a framework to generate a synthetic dataset
containing realistic communication events of any one of
the analyzed means of communications, as a technique to
detect anomalies, and as a building block for more
specific models that aim to encompass the
particularities seen in each of the analyzed systems.",
acknowledgement = ack-nhfb,
articleno = "24",
fjournal = "ACM Transactions on Knowledge Discovery from Data
(TKDD)",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J1054",
}
@Article{Zhang:2015:WIY,
author = "Jing Zhang and Jie Tang and Juanzi Li and Yang Liu and
Chunxiao Xing",
title = "Who Influenced You? {Predicting} Retweet via Social
Influence Locality",
journal = j-TKDD,
volume = "9",
number = "3",
pages = "25:1--25:??",
month = apr,
year = "2015",
CODEN = "????",
DOI = "http://dx.doi.org/10.1145/2700398",
ISSN = "1556-4681 (print), 1556-472X (electronic)",
ISSN-L = "1556-4681",
bibdate = "Tue Apr 14 09:22:28 MDT 2015",
bibsource = "http://www.acm.org/pubs/contents/journals/tkdd/;
http://www.math.utah.edu/pub/tex/bib/tkdd.bib",
abstract = "Social influence occurs when one's opinions, emotions,
or behaviors are affected by others in a social
network. However, social influence takes many forms,
and its underlying mechanism is still unclear. For
example, how is one's behavior influenced by a group of
friends who know each other and by the friends from
different ego friend circles? In this article, we study
the social influence problem in a large microblogging
network. Particularly, we consider users' (re)tweet
behaviors and focus on investigating how friends in
one's ego network influence retweet behaviors. We
propose a novel notion of social influence locality and
develop two instantiation functions based on pairwise
influence and structural diversity. The defined
influence locality functions have strong predictive
power. Without any additional features, we can obtain
an F1-score of 71.65\% for predicting users' retweet
behaviors by training a logistic regression classifier
based on the defined influence locality functions. We
incorporate social influence locality into a factor
graph model, which can further leverage the
network-based correlation. Our experiments on the large
microblogging network show that the model significantly
improves the precision of retweet prediction. Our
analysis also reveals several intriguing discoveries.
For example, if you have six friends retweeting a
microblog, the average likelihood that you will also
retweet it strongly depends on the structure among the
six friends: The likelihood will significantly drop
(only 1/6) when the six friends do not know each
other, compared with the case when the six friends know
each other.",
acknowledgement = ack-nhfb,
articleno = "25",
fjournal = "ACM Transactions on Knowledge Discovery from Data
(TKDD)",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J1054",
}
@Article{Xie:2015:MMA,
author = "Hong Xie and John C. S. Lui",
title = "Mathematical Modeling and Analysis of Product Rating
with Partial Information",
journal = j-TKDD,
volume = "9",
number = "4",
pages = "26:1--26:??",
month = jun,
year = "2015",
CODEN = "????",
DOI = "http://dx.doi.org/10.1145/2700386",
ISSN = "1556-4681 (print), 1556-472X (electronic)",
ISSN-L = "1556-4681",
bibdate = "Wed Jun 3 06:21:22 MDT 2015",
bibsource = "http://www.acm.org/pubs/contents/journals/tkdd/;
http://www.math.utah.edu/pub/tex/bib/tkdd.bib",
abstract = "Many Web services like Amazon, Epinions, and
TripAdvisor provide historical product ratings so that
users can evaluate the quality of products. Product
ratings are important because they affect how well a
product will be adopted by the market. The challenge is
that we only have partial information on these ratings:
each user assigns ratings to only a small subset of
products. Under this partial information setting, we
explore a number of fundamental questions. What is the
minimum number of ratings a product needs so that one
can make a reliable evaluation of its quality? How may
users' misbehavior, such as cheating in product rating,
affect the evaluation result? To answer these
questions, we present a probabilistic model to capture
various important factors (e.g., rating aggregation
rules, rating behavior) that may influence the product
quality assessment under the partial information
setting. We derive the minimum number of ratings needed
to produce a reliable indicator on the quality of a
product. We extend our model to accommodate users'
misbehavior in product rating. We derive the maximum
fraction of misbehaving users that a rating aggregation
rule can tolerate and the minimum number of ratings
needed to compensate. We carry out experiments using
both synthetic and real-world data (from Amazon and
TripAdvisor). We not only validate our model but also
show that the ``average rating rule'' produces more
reliable and robust product quality assessments than
the ``majority rating rule'' and the ``median rating
rule'' in aggregating product ratings. Last, we perform
experiments on two movie rating datasets (from Flixster
and Netflix) to demonstrate how to apply our framework
to improve the applications of recommender systems.",
acknowledgement = ack-nhfb,
articleno = "26",
fjournal = "ACM Transactions on Knowledge Discovery from Data
(TKDD)",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J1054",
}
@Article{Esuli:2015:OTQ,
author = "Andrea Esuli and Fabrizio Sebastiani",
title = "Optimizing Text Quantifiers for Multivariate Loss
Functions",
journal = j-TKDD,
volume = "9",
number = "4",
pages = "27:1--27:??",
month = jun,
year = "2015",
CODEN = "????",
DOI = "http://dx.doi.org/10.1145/2700406",
ISSN = "1556-4681 (print), 1556-472X (electronic)",
ISSN-L = "1556-4681",
bibdate = "Wed Jun 3 06:21:22 MDT 2015",
bibsource = "http://www.acm.org/pubs/contents/journals/tkdd/;
http://www.math.utah.edu/pub/tex/bib/tkdd.bib",
abstract = "We address the problem of quantification, a supervised
learning task whose goal is, given a class, to estimate
the relative frequency (or prevalence) of the class in
a dataset of unlabeled items. Quantification has
several applications in data and text mining, such as
estimating the prevalence of positive reviews in a set
of reviews of a given product or estimating the
prevalence of a given support issue in a dataset of
transcripts of phone calls to tech support. So far,
quantification has been addressed by learning a
general-purpose classifier, counting the unlabeled
items that have been assigned the class, and tuning the
obtained counts according to some heuristics. In this
article, we depart from the tradition of using
general-purpose classifiers and use instead a
supervised learning model for structured prediction,
capable of generating classifiers directly optimized
for the (multivariate and nonlinear) function used for
evaluating quantification accuracy. The experiments
that we have run on 5,500 binary high-dimensional
datasets (averaging more than 14,000 documents each)
show that this method is more accurate, more stable,
and more efficient than existing state-of-the-art
quantification methods.",
acknowledgement = ack-nhfb,
articleno = "27",
fjournal = "ACM Transactions on Knowledge Discovery from Data
(TKDD)",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J1054",
}
@Article{Lin:2015:IMS,
author = "Bing-Rong Lin and Daniel Kifer",
title = "Information Measures in Statistical Privacy and Data
Processing Applications",
journal = j-TKDD,
volume = "9",
number = "4",
pages = "28:1--28:??",
month = jun,
year = "2015",
CODEN = "????",
DOI = "http://dx.doi.org/10.1145/2700407",
ISSN = "1556-4681 (print), 1556-472X (electronic)",
ISSN-L = "1556-4681",
bibdate = "Wed Jun 3 06:21:22 MDT 2015",
bibsource = "http://www.acm.org/pubs/contents/journals/tkdd/;
http://www.math.utah.edu/pub/tex/bib/tkdd.bib",
abstract = "In statistical privacy, utility refers to two
concepts: information preservation, how much
statistical information is retained by a sanitizing
algorithm, and usability, how (and with how much
difficulty) one extracts this information to build
statistical models, answer queries, and so forth. Some
scenarios incentivize a separation between information
preservation and usability, so that the data owner
first chooses a sanitizing algorithm to maximize a
measure of information preservation, and, afterward,
the data consumers process the sanitized output
according to their various individual needs [Ghosh et
al. 2009; Williams and McSherry 2010]. We analyze the
information-preserving properties of utility measures
with a combination of two new and three existing
utility axioms and study how violations of an axiom can
be fixed. We show that the average (over possible
outputs of the sanitizer) error of Bayesian decision
makers forms the unique class of utility measures that
satisfy all of the axioms. The axioms are agnostic to
Bayesian concepts such as subjective probabilities and
hence strengthen support for Bayesian views in privacy
research. In particular, this result connects
information preservation to aspects of usability---if the
information preservation of a sanitizing algorithm
should be measured as the average error of a Bayesian
decision maker, shouldn't Bayesian decision theory be a
good choice when it comes to using the sanitized
outputs for various purposes? We put this idea to the
test in the unattributed histogram problem where our
decision-theoretic postprocessing algorithm empirically
outperforms previously proposed approaches.",
acknowledgement = ack-nhfb,
articleno = "28",
fjournal = "ACM Transactions on Knowledge Discovery from Data
(TKDD)",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J1054",
}
@Article{Huang:2015:DAC,
author = "Hao Huang and Shinjae Yoo and Dantong Yu and Hong
Qin",
title = "Density-Aware Clustering Based on Aggregated Heat
Kernel and Its Transformation",
journal = j-TKDD,
volume = "9",
number = "4",
pages = "29:1--29:??",
month = jun,
year = "2015",
CODEN = "????",
DOI = "http://dx.doi.org/10.1145/2700385",
ISSN = "1556-4681 (print), 1556-472X (electronic)",
ISSN-L = "1556-4681",
bibdate = "Wed Jun 3 06:21:22 MDT 2015",
bibsource = "http://www.acm.org/pubs/contents/journals/tkdd/;
http://www.math.utah.edu/pub/tex/bib/tkdd.bib",
abstract = "Current spectral clustering algorithms suffer from the
sensitivity to existing noise and parameter scaling and
may not be aware of different density distributions
across clusters. If these problems are left untreated,
the consequent clustering results cannot accurately
represent true data patterns, in particular, for
complex real-world datasets with heterogeneous
densities. This article aims to solve these problems by
proposing a diffusion-based Aggregated Heat Kernel
(AHK) to improve the clustering stability, and a Local
Density Affinity Transformation (LDAT) to correct the
bias originating from different cluster densities. AHK
statistically models the heat diffusion traces along
the entire time scale, so it ensures robustness during
the clustering process, while LDAT probabilistically
reveals the local density of each instance and
suppresses the local density bias in the affinity
matrix. Our proposed framework integrates these two
techniques systematically. As a result, it not only
provides an advanced noise-resisting and density-aware
spectral mapping to the original dataset but also
demonstrates the stability during the processing of
tuning the scaling parameter (which usually controls
the range of neighborhood). Furthermore, our framework
works well with the majority of similarity kernels,
which ensures its applicability to many types of data
and problem domains. The systematic experiments on
different applications show that our proposed algorithm
outperforms state-of-the-art clustering algorithms for
the data with heterogeneous density distributions and
achieves robust clustering performance with respect to
tuning the scaling parameter and handling various
levels and types of noise.",
acknowledgement = ack-nhfb,
articleno = "29",
fjournal = "ACM Transactions on Knowledge Discovery from Data
(TKDD)",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J1054",
}
@Article{Yu:2015:CSF,
author = "Kui Yu and Wei Ding and Dan A. Simovici and Hao Wang
and Jian Pei and Xindong Wu",
title = "Classification with Streaming Features: an
Emerging-Pattern Mining Approach",
journal = j-TKDD,
volume = "9",
number = "4",
pages = "30:1--30:??",
month = jun,
year = "2015",
CODEN = "????",
DOI = "http://dx.doi.org/10.1145/2700409",
ISSN = "1556-4681 (print), 1556-472X (electronic)",
ISSN-L = "1556-4681",
bibdate = "Wed Jun 3 06:21:22 MDT 2015",
bibsource = "http://www.acm.org/pubs/contents/journals/tkdd/;
http://www.math.utah.edu/pub/tex/bib/tkdd.bib",
abstract = "Many datasets from real-world applications have very
high-dimensional or increasing feature space. It is a
new research problem to learn and maintain a classifier
to deal with very high dimensionality or streaming
features. In this article, we adapt the well-known
emerging-pattern--based classification models and
propose a semi-streaming approach. For streaming
features, it is computationally expensive or even
prohibitive to mine long-emerging patterns, and it is
nontrivial to integrate emerging-pattern mining with
feature selection. We present an online feature
selection step, which is capable of selecting and
maintaining a pool of effective features from a feature
stream. Then, in our offline step, separated from the
online step, we periodically compute and update
emerging patterns from the pool of selected features
from the online step. We evaluate the effectiveness and
efficiency of the proposed method using a series of
benchmark datasets and a real-world case study on Mars
crater detection. Our proposed method yields
classification performance comparable to the
state-of-art static classification methods. Most
important, the proposed method is significantly faster
and can efficiently handle datasets with streaming
features.",
acknowledgement = ack-nhfb,
articleno = "30",
fjournal = "ACM Transactions on Knowledge Discovery from Data
(TKDD)",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J1054",
}
@Article{Liu:2015:SEH,
author = "Guimei Liu and Haojun Zhang and Mengling Feng and
Limsoon Wong and See-Kiong Ng",
title = "Supporting Exploratory Hypothesis Testing and
Analysis",
journal = j-TKDD,
volume = "9",
number = "4",
pages = "31:1--31:??",
month = jun,
year = "2015",
CODEN = "????",
DOI = "http://dx.doi.org/10.1145/2701430",
ISSN = "1556-4681 (print), 1556-472X (electronic)",
ISSN-L = "1556-4681",
bibdate = "Wed Jun 3 06:21:22 MDT 2015",
bibsource = "http://www.acm.org/pubs/contents/journals/tkdd/;
http://www.math.utah.edu/pub/tex/bib/tkdd.bib",
abstract = "Conventional hypothesis testing is carried out in a
hypothesis-driven manner. A scientist must first
formulate a hypothesis based on what he or she sees and
then devise a variety of experiments to test it. Given
the rapid growth of data, it has become virtually
impossible for a person to manually inspect all data to
find all of the interesting hypotheses for testing. In
this article, we propose and develop a data-driven
framework for automatic hypothesis testing and
analysis. We define a hypothesis as a comparison
between two or more subpopulations. We find
subpopulations for comparison using frequent pattern
mining techniques and then pair them up for statistical
hypothesis testing. We also generate additional
information for further analysis of the hypotheses that
are deemed significant. The number of hypotheses
generated can be very large, and many of them are very
similar. We develop algorithms to remove redundant
hypotheses and present a succinct set of significant
hypotheses to users. We conducted a set of experiments
to show the efficiency and effectiveness of the
proposed algorithms. The results show that our system
can help users (1) identify significant hypotheses
efficiently, (2) isolate the reasons behind significant
hypotheses efficiently, and (3) find confounding
factors that form Simpson's paradoxes with discovered
significant hypotheses.",
acknowledgement = ack-nhfb,
articleno = "31",
fjournal = "ACM Transactions on Knowledge Discovery from Data
(TKDD)",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J1054",
}
@Article{Greco:2015:PDU,
author = "Gianluigi Greco and Antonella Guzzo and Francesco
Lupia and Luigi Pontieri",
title = "Process Discovery under Precedence Constraints",
journal = j-TKDD,
volume = "9",
number = "4",
pages = "32:1--32:??",
month = jun,
year = "2015",
CODEN = "????",
DOI = "http://dx.doi.org/10.1145/2710020",
ISSN = "1556-4681 (print), 1556-472X (electronic)",
ISSN-L = "1556-4681",
bibdate = "Wed Jun 3 06:21:22 MDT 2015",
bibsource = "http://www.acm.org/pubs/contents/journals/tkdd/;
http://www.math.utah.edu/pub/tex/bib/tkdd.bib",
abstract = "Process discovery has emerged as a powerful approach
to support the analysis and the design of complex
processes. It consists of analyzing a set of traces
registering the sequence of tasks performed along
several enactments of a transactional system, in order
to build a process model that can explain all the
episodes recorded over them. An approach to accomplish
this task is presented that can benefit from the
background knowledge that, in many cases, is available
to the analysts taking care of the process (re-)design.
The approach is based on encoding the information
gathered from the log and the (possibly) given
background knowledge in terms of precedence
constraints, that is, of constraints over the topology
of the resulting process models. Mining algorithms are
eventually formulated in terms of reasoning problems
over precedence constraints, and the computational
complexity of such problems is thoroughly analyzed by
tracing their tractability frontier. Solution
algorithms are proposed and their properties analyzed.
These algorithms have been implemented in a prototype
system, and results of a thorough experimental activity
are discussed.",
acknowledgement = ack-nhfb,
articleno = "32",
fjournal = "ACM Transactions on Knowledge Discovery from Data
(TKDD)",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J1054",
}
@Article{Mirbakhsh:2015:ITR,
author = "Nima Mirbakhsh and Charles X. Ling",
title = "Improving Top-{$N$} Recommendation for Cold-Start
Users via Cross-Domain Information",
journal = j-TKDD,
volume = "9",
number = "4",
pages = "33:1--33:??",
month = jun,
year = "2015",
CODEN = "????",
DOI = "http://dx.doi.org/10.1145/2724720",
ISSN = "1556-4681 (print), 1556-472X (electronic)",
ISSN-L = "1556-4681",
bibdate = "Wed Jun 3 06:21:22 MDT 2015",
bibsource = "http://www.acm.org/pubs/contents/journals/tkdd/;
http://www.math.utah.edu/pub/tex/bib/tkdd.bib",
abstract = "Making accurate recommendations for cold-start users
is a challenging yet important problem in
recommendation systems. Including more information from
other domains is a natural solution to improve the
recommendations. However, most previous work in
cross-domain recommendations has focused on improving
prediction accuracy with several severe limitations. In
this article, we extend our previous work on
clustering-based matrix factorization in single domains
into cross domains. In addition, we utilize recent
results on unobserved ratings. Our new method can more
effectively utilize data from auxiliary domains to
achieve better recommendations, especially for
cold-start users. For example, our method improves the
recall to 21\% on average for cold-start users, whereas
previous methods result in only 15\% recall in the
cross-domain Amazon dataset. We also observe almost the
same improvements in the Epinions dataset. Considering
that it is often difficult to make even a small
improvement in recommendations, for cold-start users in
particular, our result is quite significant.",
acknowledgement = ack-nhfb,
articleno = "33",
fjournal = "ACM Transactions on Knowledge Discovery from Data
(TKDD)",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J1054",
}
@Article{Bonchi:2015:CCC,
author = "Francesco Bonchi and Aristides Gionis and Francesco
Gullo and Charalampos E. Tsourakakis and Antti
Ukkonen",
title = "Chromatic Correlation Clustering",
journal = j-TKDD,
volume = "9",
number = "4",
pages = "34:1--34:??",
month = jun,
year = "2015",
CODEN = "????",
DOI = "http://dx.doi.org/10.1145/2728170",
ISSN = "1556-4681 (print), 1556-472X (electronic)",
ISSN-L = "1556-4681",
bibdate = "Wed Jun 3 06:21:22 MDT 2015",
bibsource = "http://www.acm.org/pubs/contents/journals/tkdd/;
http://www.math.utah.edu/pub/tex/bib/tkdd.bib",
abstract = "We study a novel clustering problem in which the
pairwise relations between objects are categorical.
This problem can be viewed as clustering the vertices
of a graph whose edges are of different types (colors).
We introduce an objective function that ensures the
edges within each cluster have, as much as possible,
the same color. We show that the problem is NP -hard
and propose a randomized algorithm with approximation
guarantee proportional to the maximum degree of the
input graph. The algorithm iteratively picks a random
edge as a pivot, builds a cluster around it, and
removes the cluster from the graph. Although being
fast, easy to implement, and parameter-free, this
algorithm tends to produce a relatively large number of
clusters. To overcome this issue we introduce a variant
algorithm, which modifies how the pivot is chosen and
how the cluster is built around the pivot. Finally, to
address the case where a fixed number of output
clusters is required, we devise a third algorithm that
directly optimizes the objective function based on the
alternating-minimization paradigm. We also extend our
objective function to handle cases where object's
relations are described by multiple labels. We modify
our randomized approximation algorithm to optimize such
an extended objective function and show that its
approximation guarantee remains proportional to the
maximum degree of the graph. We test our algorithms on
synthetic and real data from the domains of social
media, protein-interaction networks, and bibliometrics.
Results reveal that our algorithms outperform a
baseline algorithm both in the task of reconstructing a
ground-truth clustering and in terms of
objective-function value.",
acknowledgement = ack-nhfb,
articleno = "34",
fjournal = "ACM Transactions on Knowledge Discovery from Data
(TKDD)",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J1054",
}
@Article{Wang:2015:LSC,
author = "Hua Wang and Feiping Nie and Heng Huang",
title = "Large-Scale Cross-Language {Web} Page Classification
via Dual Knowledge Transfer Using Fast Nonnegative
Matrix Trifactorization",
journal = j-TKDD,
volume = "10",
number = "1",
pages = "1:1--1:??",
month = jul,
year = "2015",
CODEN = "????",
DOI = "http://dx.doi.org/10.1145/2710021",
ISSN = "1556-4681 (print), 1556-472X (electronic)",
ISSN-L = "1556-4681",
bibdate = "Tue Jul 28 17:19:31 MDT 2015",
bibsource = "http://www.acm.org/pubs/contents/journals/tkdd/;
http://www.math.utah.edu/pub/tex/bib/tkdd.bib",
abstract = "With the rapid growth of modern technologies, Internet
has reached almost every corner of the world. As a
result, it becomes more and more important to manage
and mine information contained in Web pages in
different languages. Traditional supervised learning
methods usually require a large amount of training data
to obtain accurate and robust classification models.
However, labeled Web pages did not increase as fast as
the growth of Internet. The lack of sufficient training
Web pages in many languages, especially for those in
uncommonly used languages, makes it a challenge for
traditional classification algorithms to achieve
satisfactory performance. To address this, we observe
that Web pages for a same topic from different
languages usually share some common semantic patterns,
though in different representation forms. In addition,
we also observe that the associations between word
clusters and Web page classes are another type of
reliable carriers to transfer knowledge across
languages. With these recognitions, in this article we
propose a novel joint nonnegative matrix
trifactorization (NMTF) based Dual Knowledge Transfer
(DKT) approach for cross-language Web page
classification. Our approach transfers knowledge from
the auxiliary language, in which abundant labeled Web
pages are available, to the target languages, in which
we want to classify Web pages, through two different
paths: word cluster approximation and the associations
between word clusters and Web page classes. With the
reinforcement between these two different knowledge
transfer paths, our approach can achieve better
classification accuracy. In order to deal with the
large-scale real world data, we further develop the
proposed DKT approach by constraining the factor
matrices of NMTF to be cluster indicator matrices. Due
to the nature of cluster indicator matrices, we can
decouple the proposed optimization objective and the
resulted subproblems are of much smaller sizes
involving much less matrix multiplications, which make
our new approach much more computationally efficient.
We evaluate the proposed approach in extensive
experiments using a real world cross-language Web page
data set. Promising results have demonstrated the
effectiveness of our approach that are consistent with
our theoretical analyses.",
acknowledgement = ack-nhfb,
articleno = "1",
fjournal = "ACM Transactions on Knowledge Discovery from Data
(TKDD)",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J1054",
}
@Article{Zhou:2015:SIB,
author = "Yang Zhou and Ling Liu",
title = "Social Influence Based Clustering and Optimization
over Heterogeneous Information Networks",
journal = j-TKDD,
volume = "10",
number = "1",
pages = "2:1--2:??",
month = jul,
year = "2015",
CODEN = "????",
DOI = "http://dx.doi.org/10.1145/2717314",
ISSN = "1556-4681 (print), 1556-472X (electronic)",
ISSN-L = "1556-4681",
bibdate = "Tue Jul 28 17:19:31 MDT 2015",
bibsource = "http://www.acm.org/pubs/contents/journals/tkdd/;
http://www.math.utah.edu/pub/tex/bib/tkdd.bib",
abstract = "Social influence analysis has shown great potential
for strategic marketing decision. It is well known that
people influence one another based on both their social
connections and the social activities that they have
engaged in the past. In this article, we develop an
innovative and high-performance social influence based
graph clustering framework with four unique features.
First, we explicitly distinguish social connection
based influence (self-influence) and social activity
based influence (co-influence). We compute the
self-influence similarity between two members based on
their social connections within a single collaboration
network, and compute the co-influence similarity by
taking into account not only the set of activities that
people participate but also the semantic association
between these activities. Second, we define the concept
of influence-based similarity by introducing a unified
influence-based similarity matrix that employs an
iterative weight update method to integrate
self-influence and co-influence similarities. Third, we
design a dynamic learning algorithm, called SI-Cluster,
for social influence based graph clustering. It
iteratively partitions a large social collaboration
network into K clusters based on both the social
network itself and the multiple associated activity
information networks, each representing a category of
activities that people have engaged. To make the
SI-Cluster algorithm converge fast, we transform
sophisticated nonlinear fractional programming problem
with respect to multiple weights into a straightforward
nonlinear parametric programming problem of single
variable. Finally, we develop an optimization technique
of diagonalizable-matrix approximation to speed up the
computation of self-influence similarity and
co-influence similarities. Our SI-Cluster-Opt
significantly improves the efficiency of SI-Cluster on
large graphs while maintaining high quality of
clustering results. Extensive experimental evaluation
on three real-world graphs shows that, compared to
existing representative graph clustering algorithms,
our SI-Cluster-Opt approach not only achieves a very
good balance between self-influence and co-influence
similarities but also scales extremely well for
clustering large graphs in terms of time complexity
while meeting the guarantee of high density, low
entropy and low Davies--Bouldin Index.",
acknowledgement = ack-nhfb,
articleno = "2",
fjournal = "ACM Transactions on Knowledge Discovery from Data
(TKDD)",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J1054",
}
@Article{Papalexakis:2015:PSP,
author = "Evangelos E. Papalexakis and Christos Faloutsos and
Nicholas D. Sidiropoulos",
title = "{ParCube}: Sparse Parallelizable {CANDECOMP--PARAFAC}
Tensor Decomposition",
journal = j-TKDD,
volume = "10",
number = "1",
pages = "3:1--3:??",
month = jul,
year = "2015",
CODEN = "????",
DOI = "http://dx.doi.org/10.1145/2729980",
ISSN = "1556-4681 (print), 1556-472X (electronic)",
ISSN-L = "1556-4681",
bibdate = "Tue Jul 28 17:19:31 MDT 2015",
bibsource = "http://www.acm.org/pubs/contents/journals/tkdd/;
http://www.math.utah.edu/pub/tex/bib/tkdd.bib",
abstract = "How can we efficiently decompose a tensor into sparse
factors, when the data do not fit in memory? Tensor
decompositions have gained a steadily increasing
popularity in data-mining applications; however, the
current state-of-art decomposition algorithms operate
on main memory and do not scale to truly large
datasets. In this work, we propose ParCube, a new and
highly parallelizable method for speeding up tensor
decompositions that is well suited to produce sparse
approximations. Experiments with even moderately large
data indicate over 90\% sparser outputs and 14 times
faster execution, with approximation error close to the
current state of the art irrespective of computation
and memory requirements. We provide theoretical
guarantees for the algorithm's correctness and we
experimentally validate our claims through extensive
experiments, including four different real world
datasets (Enron, LBNL, Facebook and NELL),
demonstrating its effectiveness for data-mining
practitioners. In particular, we are the first to
analyze the very large NELL dataset using a sparse
tensor decomposition, demonstrating that ParCube
enables us to handle effectively and efficiently very
large datasets. Finally, we make our highly scalable
parallel implementation publicly available, enabling
reproducibility of our work.",
acknowledgement = ack-nhfb,
articleno = "3",
fjournal = "ACM Transactions on Knowledge Discovery from Data
(TKDD)",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J1054",
}
@article{Ahmed:2015:AMC,
  author          = {Rezwan Ahmed and George Karypis},
  title           = {Algorithms for Mining the Coevolving Relational Motifs
                     in Dynamic Networks},
  journal         = j-TKDD,
  volume          = {10},
  number          = {1},
  pages           = {4:1--4:??},
  month           = jul,
  year            = {2015},
  coden           = {????},
  doi             = {http://dx.doi.org/10.1145/2733380},
  issn            = {1556-4681 (print), 1556-472X (electronic)},
  issn-l          = {1556-4681},
  bibdate         = {Tue Jul 28 17:19:31 MDT 2015},
  bibsource       = {http://www.acm.org/pubs/contents/journals/tkdd/;
                     http://www.math.utah.edu/pub/tex/bib/tkdd.bib},
  abstract        = {Computational methods and tools that can efficiently
                     and effectively analyze the temporal changes in dynamic
                     complex relational networks enable us to gain
                     significant insights regarding the entity relations and
                     their evolution. This article introduces a new class of
                     dynamic graph patterns, referred to as coevolving
                     relational motifs (CRMs), which are designed to
                     identify recurring sets of entities whose relations
                     change in a consistent way over time. CRMs can provide
                     evidence to the existence of, possibly unknown,
                     coordination mechanisms by identifying the relational
                     motifs that evolve in a similar and highly conserved
                     fashion. We developed an algorithm to efficiently
                     analyze the frequent relational changes between the
                     entities of the dynamic networks and capture all
                     frequent coevolutions as CRMs. Our algorithm follows a
                     depth-first exploration of the frequent CRM lattice and
                     incorporates canonical labeling for redundancy
                     elimination. Experimental results based on multiple
                     real world dynamic networks show that the method is
                     able to efficiently identify CRMs. In addition, a
                     qualitative analysis of the results shows that the
                     discovered patterns can be used as features to
                     characterize the dynamic network.},
  acknowledgement = ack-nhfb,
  articleno       = {4},
  fjournal        = {ACM Transactions on Knowledge Discovery from Data
                     (TKDD)},
  journal-url     = {http://portal.acm.org/browse_dl.cfm?idx=J1054},
}
@article{Campello:2015:HDE,
  author          = {Ricardo J. G. B. Campello and Davoud Moulavi and
                     Arthur Zimek and J{\"o}rg Sander},
  title           = {Hierarchical Density Estimates for Data Clustering,
                     Visualization, and Outlier Detection},
  journal         = j-TKDD,
  volume          = {10},
  number          = {1},
  pages           = {5:1--5:??},
  month           = jul,
  year            = {2015},
  coden           = {????},
  doi             = {http://dx.doi.org/10.1145/2733381},
  issn            = {1556-4681 (print), 1556-472X (electronic)},
  issn-l          = {1556-4681},
  bibdate         = {Tue Jul 28 17:19:31 MDT 2015},
  bibsource       = {http://www.acm.org/pubs/contents/journals/tkdd/;
                     http://www.math.utah.edu/pub/tex/bib/tkdd.bib},
  abstract        = {An integrated framework for density-based cluster
                     analysis, outlier detection, and data visualization is
                     introduced in this article. The main module consists of
                     an algorithm to compute hierarchical estimates of the
                     level sets of a density, following Hartigan's classic
                     model of density-contour clusters and trees. Such an
                     algorithm generalizes and improves existing
                     density-based clustering techniques with respect to
                     different aspects. It provides as a result a complete
                     clustering hierarchy composed of all possible
                     density-based clusters following the nonparametric
                     model adopted, for an infinite range of density
                     thresholds. The resulting hierarchy can be easily
                     processed so as to provide multiple ways for data
                     visualization and exploration. It can also be further
                     postprocessed so that: (i) a normalized score of
                     ``outlierness'' can be assigned to each data object,
                     which unifies both the global and local perspectives of
                     outliers into a single definition; and (ii) a ``flat''
                     (i.e., nonhierarchical) clustering solution composed of
                     clusters extracted from local cuts through the cluster
                     tree (possibly corresponding to different density
                     thresholds) can be obtained, either in an unsupervised
                     or in a semisupervised way. In the unsupervised
                     scenario, the algorithm corresponding to this
                     postprocessing module provides a global, optimal
                     solution to the formal problem of maximizing the
                     overall stability of the extracted clusters. If
                     partially labeled objects or instance-level constraints
                     are provided by the user, the algorithm can solve the
                     problem by considering both constraints
                     violations/satisfactions and cluster stability
                     criteria. An asymptotic complexity analysis, both in
                     terms of running time and memory space, is described.
                     Experiments are reported that involve a variety of
                     synthetic and real datasets, including comparisons with
                     state-of-the-art, density-based clustering and (global
                     and local) outlier detection methods.},
  acknowledgement = ack-nhfb,
  articleno       = {5},
  fjournal        = {ACM Transactions on Knowledge Discovery from Data
                     (TKDD)},
  journal-url     = {http://portal.acm.org/browse_dl.cfm?idx=J1054},
}
@article{Berardi:2015:UTR,
  author          = {Giacomo Berardi and Andrea Esuli and Fabrizio
                     Sebastiani},
  title           = {Utility-Theoretic Ranking for Semiautomated Text
                     Classification},
  journal         = j-TKDD,
  volume          = {10},
  number          = {1},
  pages           = {6:1--6:??},
  month           = jul,
  year            = {2015},
  coden           = {????},
  doi             = {http://dx.doi.org/10.1145/2742548},
  issn            = {1556-4681 (print), 1556-472X (electronic)},
  issn-l          = {1556-4681},
  bibdate         = {Tue Jul 28 17:19:31 MDT 2015},
  bibsource       = {http://www.acm.org/pubs/contents/journals/tkdd/;
                     http://www.math.utah.edu/pub/tex/bib/tkdd.bib},
  abstract        = {Semiautomated Text Classification (SATC) may be
                     defined as the task of ranking a set D of automatically
                     labelled textual documents in such a way that, if a
                     human annotator validates (i.e., inspects and corrects
                     where appropriate) the documents in a top-ranked
                     portion of D with the goal of increasing the overall
                     labelling accuracy of D, the expected increase is
                     maximized. An obvious SATC strategy is to rank D so
                     that the documents that the classifier has labelled
                     with the lowest confidence are top ranked. In this
                     work, we show that this strategy is suboptimal. We
                     develop new utility-theoretic ranking methods based on
                     the notion of validation gain, defined as the
                     improvement in classification effectiveness that would
                     derive by validating a given automatically labelled
                     document. We also propose a new effectiveness measure
                     for SATC-oriented ranking methods, based on the
                     expected reduction in classification error brought
                     about by partially validating a list generated by a
                     given ranking method. We report the results of
                     experiments showing that, with respect to the baseline
                     method mentioned earlier, and according to the proposed
                     measure, our utility-theoretic ranking methods can
                     achieve substantially higher expected reductions in
                     classification error.},
  acknowledgement = ack-nhfb,
  articleno       = {6},
  fjournal        = {ACM Transactions on Knowledge Discovery from Data
                     (TKDD)},
  journal-url     = {http://portal.acm.org/browse_dl.cfm?idx=J1054},
}
@Article{Yu:2015:DIP,
author = "Zhiwen Yu and Zhu Wang and Huilei He and Jilei Tian
and Xinjiang Lu and Bin Guo",
title = "Discovering Information Propagation Patterns in
Microblogging Services",
journal = j-TKDD,
volume = "10",
number = "1",
pages = "7:1--7:??",
month = jul,
year = "2015",
CODEN = "????",
DOI = "http://dx.doi.org/10.1145/2742801",
ISSN = "1556-4681 (print), 1556-472X (electronic)",
ISSN-L = "1556-4681",
bibdate = "Tue Jul 28 17:19:31 MDT 2015",
bibsource = "http://www.acm.org/pubs/contents/journals/tkdd/;
http://www.math.utah.edu/pub/tex/bib/tkdd.bib",
abstract = "During the last decade, microblog has become an
important social networking service with billions of
users all over the world, acting as a novel and
efficient platform for the creation and dissemination
of real-time information. Modeling and revealing the
information propagation patterns in microblogging
services can not only lead to more accurate
understanding of user behaviors and provide insights
into the underlying sociology, but also enable useful
applications such as trending prediction,
recommendation and filtering, spam detection and viral
marketing. In this article, we aim to reveal the
information propagation patterns in Sina Weibo, the
biggest microblogging service in China. First, the
cascade of each message is represented as a tree based
on its retweeting process. Afterwards, we divide the
information propagation pattern into two levels, that
is, the macro level and the micro level. On one hand,
the macro propagation patterns refer to general
propagation modes that are extracted by grouping
propagation trees based on hierarchical clustering. On
the other hand, the micro propagation patterns are
frequent information flow patterns that are discovered
using tree-based mining techniques. Experimental
results show that several interesting patterns are
extracted, such as popular message propagation,
artificial propagation, and typical information flows
between different types of users.",
acknowledgement = ack-nhfb,
articleno = "7",
fjournal = "ACM Transactions on Knowledge Discovery from Data
(TKDD)",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J1054",
}
@article{Zhang:2015:SMB,
  author          = {Xianchao Zhang and Xiaotong Zhang and Han Liu},
  title           = {Smart Multitask {Bregman} Clustering and Multitask
                     Kernel Clustering},
  journal         = j-TKDD,
  volume          = {10},
  number          = {1},
  pages           = {8:1--8:??},
  month           = jul,
  year            = {2015},
  coden           = {????},
  doi             = {http://dx.doi.org/10.1145/2747879},
  issn            = {1556-4681 (print), 1556-472X (electronic)},
  issn-l          = {1556-4681},
  bibdate         = {Tue Jul 28 17:19:31 MDT 2015},
  bibsource       = {http://www.acm.org/pubs/contents/journals/tkdd/;
                     http://www.math.utah.edu/pub/tex/bib/tkdd.bib},
  abstract        = {Traditional clustering algorithms deal with a single
                     clustering task on a single dataset. However, there are
                     many related tasks in the real world, which motivates
                     multitask clustering. Recently some multitask
                     clustering algorithms have been proposed, and among
                     them multitask Bregman clustering (MBC) is a very
                     applicable method. MBC alternatively updates clusters
                     and learns relationships between clusters of different
                     tasks, and the two phases boost each other. However,
                     the boosting does not always have positive effects on
                     improving the clustering performance, it may also cause
                     negative effects. Another issue of MBC is that it
                     cannot deal with nonlinear separable data. In this
                     article, we show that in MBC, the process of using
                     cluster relationship to boost the cluster updating
                     phase may cause negative effects, that is, cluster
                     centroids may be skewed under some conditions. We
                     propose a smart multitask Bregman clustering (S-MBC)
                     algorithm which can identify the negative effects of
                     the boosting and avoid the negative effects if they
                     occur. We then propose a multitask kernel clustering
                     (MKC) framework for nonlinear separable data by using a
                     similar framework like MBC in the kernel space. We also
                     propose a specific optimization method, which is quite
                     different from that of MBC, to implement the MKC
                     framework. Since MKC can also cause negative effects
                     like MBC, we further extend the framework of MKC to a
                     smart multitask kernel clustering (S-MKC) framework in
                     a similar way that S-MBC is extended from MBC. We
                     conduct experiments on 10 real world multitask
                     clustering datasets to evaluate the performance of
                     S-MBC and S-MKC. The results on clustering accuracy
                     show that: (1) compared with the original MBC algorithm
                     MBC, S-MBC and S-MKC perform much better; (2) compared
                     with the convex discriminative multitask relationship
                     clustering (DMTRC) algorithms DMTRC-L and DMTRC-R which
                     also avoid negative transfer, S-MBC and S-MKC perform
                     worse in the (ideal) case in which different tasks have
                     the same cluster number and the empirical label
                     marginal distribution in each task distributes evenly,
                     but better or comparable in other (more general) cases.
                     Moreover, S-MBC and S-MKC can work on the datasets in
                     which different tasks have different number of
                     clusters, violating the assumptions of DMTRC-L and
                     DMTRC-R. The results on efficiency show that S-MBC and
                     S-MKC consume more computational time than MBC and less
                     computational time than DMTRC-L and DMTRC-R. Overall
                     S-MBC and S-MKC are competitive compared with the
                     state-of-the-art multitask clustering algorithms in
                     synthetical terms of accuracy, efficiency and
                     applicability.},
  acknowledgement = ack-nhfb,
  articleno       = {8},
  fjournal        = {ACM Transactions on Knowledge Discovery from Data
                     (TKDD)},
  journal-url     = {http://portal.acm.org/browse_dl.cfm?idx=J1054},
}
@Article{Wei:2015:MTP,
author = "Wei Wei and Kathleen M. Carley",
title = "Measuring Temporal Patterns in Dynamic Social
Networks",
journal = j-TKDD,
volume = "10",
number = "1",
pages = "9:1--9:??",
month = jul,
year = "2015",
CODEN = "????",
DOI = "http://dx.doi.org/10.1145/2749465",
ISSN = "1556-4681 (print), 1556-472X (electronic)",
ISSN-L = "1556-4681",
bibdate = "Tue Jul 28 17:19:31 MDT 2015",
bibsource = "http://www.acm.org/pubs/contents/journals/tkdd/;
http://www.math.utah.edu/pub/tex/bib/tkdd.bib",
abstract = "Given social networks over time, how can we measure
network activities across different timesteps with a
limited number of metrics? We propose two classes of
dynamic metrics for assessing temporal evolution
patterns of agents in terms of persistency and
emergence. For each class of dynamic metrics, we
implement it using three different temporal aggregation
models ranging from the most commonly used Average
Aggregation Model to the more complex models such as
the Exponential Aggregation Model. We argue that the
problem of measuring temporal patterns can be
formulated using Recency and Primacy effect, which is a
concept used to characterize human cognitive processes.
Experimental results show that the way metrics model
Recency--Primacy effect is closely related to their
abilities to measure temporal patterns. Furthermore,
our results indicate that future network agent
activities can be predicted based on history
information using dynamic metrics. By conducting
multiple experiments, we are also able to find an
optimal length of history information that is most
relevant to future activities. This optimal length is
highly consistent within a dataset and can be used as
an intrinsic metric to evaluate a dynamic social
network.",
acknowledgement = ack-nhfb,
articleno = "9",
fjournal = "ACM Transactions on Knowledge Discovery from Data
(TKDD)",
journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J1054",
}
@article{Liu:2015:RAT,
  author          = {Siyuan Liu and Qiang Qu and Shuhui Wang},
  title           = {Rationality Analytics from Trajectories},
  journal         = j-TKDD,
  volume          = {10},
  number          = {1},
  pages           = {10:1--10:??},
  month           = jul,
  year            = {2015},
  coden           = {????},
  doi             = {http://dx.doi.org/10.1145/2735634},
  issn            = {1556-4681 (print), 1556-472X (electronic)},
  issn-l          = {1556-4681},
  bibdate         = {Tue Jul 28 17:19:31 MDT 2015},
  bibsource       = {http://www.acm.org/pubs/contents/journals/tkdd/;
                     http://www.math.utah.edu/pub/tex/bib/tkdd.bib},
  abstract        = {The availability of trajectories tracking the
                     geographical locations of people as a function of time
                     offers an opportunity to study human behaviors. In this
                     article, we study rationality from the perspective of
                     user decision on visiting a point of interest (POI)
                     which is represented as a trajectory. However, the
                     analysis of rationality is challenged by a number of
                     issues, for example, how to model a trajectory in terms
                     of complex user decision processes? and how to detect
                     hidden factors that have significant impact on the
                     rational decision making? In this study, we propose
                     Rationality Analysis Model (RAM) to analyze rationality
                     from trajectories in terms of a set of impact factors.
                     In order to automatically identify hidden factors, we
                     propose a method, Collective Hidden Factor Retrieval
                     (CHFR), which can also be generalized to parse multiple
                     trajectories at the same time or parse individual
                     trajectories of different time periods. Extensive
                     experimental study is conducted on three large-scale
                     real-life datasets (i.e., taxi trajectories, user
                     shopping trajectories, and visiting trajectories in a
                     theme park). The results show that the proposed methods
                     are efficient, effective, and scalable. We also deploy
                     a system in a large theme park to conduct a field
                     study. Interesting findings and user feedback of the
                     field study are provided to support other applications
                     in user behavior mining and analysis, such as business
                     intelligence and user management for marketing
                     purposes.},
  acknowledgement = ack-nhfb,
  articleno       = {10},
  fjournal        = {ACM Transactions on Knowledge Discovery from Data
                     (TKDD)},
  journal-url     = {http://portal.acm.org/browse_dl.cfm?idx=J1054},
}