%%% -*-BibTeX-*-
%%% ====================================================================
%%% BibTeX-file{
%%% author = "Nelson H. F. Beebe",
%%% version = "1.12",
%%% date = "11 March 2019",
%%% time = "18:58:03 MST",
%%% filename = "topc.bib",
%%% address = "University of Utah
%%% Department of Mathematics, 110 LCB
%%% 155 S 1400 E RM 233
%%% Salt Lake City, UT 84112-0090
%%% USA",
%%% telephone = "+1 801 581 5254",
%%% FAX = "+1 801 581 4148",
%%% URL = "http://www.math.utah.edu/~beebe",
%%% checksum = "00952 5174 29538 275848",
%%% email = "beebe at math.utah.edu, beebe at acm.org,
%%% beebe at computer.org (Internet)",
%%% codetable = "ISO/ASCII",
%%% keywords = "ACM Transactions on Parallel Computing
%%% (TOPC); bibliography; BibTeX",
%%% license = "public domain",
%%% supported = "yes",
%%% docstring = "This is a COMPLETE BibTeX bibliography for
%%% ACM Transactions on Parallel Computing (TOPC)
%%% (CODEN ????, ISSN 2329-4949 (print),
%%% 2329-4957 (electronic)). The journal appears
%%% quarterly, and publication began with volume
%%% 1, number 1, in September 2014.
%%%
%%% At version 1.12, the COMPLETE journal
%%% coverage looked like this:
%%%
%%% 2014 ( 8) 2016 ( 25) 2018 ( 22)
%%% 2015 ( 29) 2017 ( 16) 2019 ( 4)
%%%
%%% Article: 104
%%%
%%% Total entries: 104
%%%
%%% The journal Web page can be found at:
%%%
%%% http://topc.acm.org/
%%%
%%% The journal table of contents page is at:
%%%
%%% http://dl.acm.org/pub.cfm?id=J1496
%%% http://dl.acm.org/citation.cfm?id=2632163
%%%
%%% Qualified subscribers can retrieve the full
%%% text of recent articles in PDF form.
%%%
%%% The initial draft was extracted from the ACM
%%% Web pages.
%%%
%%% ACM copyrights explicitly permit abstracting
%%% with credit, so article abstracts, keywords,
%%% and subject classifications have been
%%% included in this bibliography wherever
%%% available. Article reviews have been
%%% omitted, until their copyright status has
%%% been clarified.
%%%
%%% bibsource keys in the bibliography entries
%%% below indicate the entry originally came
%%% from the computer science bibliography
%%% archive, even though it has likely since
%%% been corrected and updated.
%%%
%%% URL keys in the bibliography point to
%%% World Wide Web locations of additional
%%% information about the entry.
%%%
%%% BibTeX citation tags are uniformly chosen
%%% as name:year:abbrev, where name is the
%%% family name of the first author or editor,
%%% year is a 4-digit number, and abbrev is a
%%% 3-letter condensation of important title
%%% words. Citation tags were automatically
%%% generated by software developed for the
%%% BibNet Project.
%%%
%%% In this bibliography, entries are sorted in
%%% publication order, using ``bibsort -byvolume.''
%%%
%%% The checksum field above contains a CRC-16
%%% checksum as the first value, followed by the
%%% equivalent of the standard UNIX wc (word
%%% count) utility output of lines, words, and
%%% characters. This is produced by Robert
%%% Solovay's checksum utility.",
%%% }
%%% ====================================================================
@Preamble{"\input bibnames.sty" #
"\def \TM {${}^{\sc TM}$}"
}
%%% ====================================================================
%%% Acknowledgement abbreviations:
@String{ack-nhfb = "Nelson H. F. Beebe,
University of Utah,
Department of Mathematics, 110 LCB,
155 S 1400 E RM 233,
Salt Lake City, UT 84112-0090, USA,
Tel: +1 801 581 5254,
FAX: +1 801 581 4148,
e-mail: \path|beebe@math.utah.edu|,
\path|beebe@acm.org|,
\path|beebe@computer.org| (Internet),
URL: \path|http://www.math.utah.edu/~beebe/|"}
%%% ====================================================================
%%% Journal abbreviations:
@String{j-TOPC = "ACM Transactions on Parallel Computing
(TOPC)"}
%%% ====================================================================
%%% Bibliography entries:
@Article{Gibbons:2014:ATP,
author = "Phillip B. Gibbons",
title = "{ACM Transactions on Parallel Computing}: an
introduction",
journal = j-TOPC,
volume = "1",
number = "1",
pages = "1:1--1:??",
month = sep,
year = "2014",
CODEN = "????",
DOI = "https://doi.org/10.1145/2661651",
ISSN = "2329-4949 (print), 2329-4957 (electronic)",
ISSN-L = "2329-4949",
bibdate = "Fri Oct 17 12:28:03 MDT 2014",
bibsource = "http://topc.acm.org/;
http://www.math.utah.edu/pub/tex/bib/topc.bib",
abstract = "Techniques",
acknowledgement = ack-nhfb,
articleno = "1",
fjournal = "ACM Transactions on Parallel Computing",
journal-URL = "http://dl.acm.org/citation.cfm?id=2632163",
}
@Article{Lilja:2014:I,
author = "David J. Lilja",
title = "Introduction",
journal = j-TOPC,
volume = "1",
number = "1",
pages = "2:1--2:??",
month = sep,
year = "2014",
CODEN = "????",
DOI = "https://doi.org/10.1145/2609798",
ISSN = "2329-4949 (print), 2329-4957 (electronic)",
ISSN-L = "2329-4949",
bibdate = "Fri Oct 17 12:28:03 MDT 2014",
bibsource = "http://topc.acm.org/;
http://www.math.utah.edu/pub/tex/bib/topc.bib",
acknowledgement = ack-nhfb,
articleno = "2",
fjournal = "ACM Transactions on Parallel Computing",
journal-URL = "http://dl.acm.org/citation.cfm?id=2632163",
}
@Article{Rane:2014:EPO,
author = "Ashay Rane and James Browne",
title = "Enhancing Performance Optimization of Multicore\slash
Multichip Nodes with Data Structure Metrics",
journal = j-TOPC,
volume = "1",
number = "1",
pages = "3:1--3:??",
month = sep,
year = "2014",
CODEN = "????",
DOI = "https://doi.org/10.1145/2588788",
ISSN = "2329-4949 (print), 2329-4957 (electronic)",
ISSN-L = "2329-4949",
bibdate = "Fri Oct 17 12:28:03 MDT 2014",
bibsource = "http://topc.acm.org/;
http://www.math.utah.edu/pub/tex/bib/topc.bib",
abstract = "Program performance optimization is usually based
solely on measurements of execution behavior of code
segments using hardware performance counters. However,
memory access patterns are critical performance
limiting factors for today's multicore chips where
performance is highly memory bound. Therefore diagnoses
and selection of optimizations based only on
measurements of the execution behavior of code segments
are incomplete because they do not incorporate
knowledge of memory access patterns and behaviors. This
article presents a low-overhead tool (MACPO) that
captures memory traces and computes metrics for the
memory access behavior of source-level (C, C++,
Fortran) data structures. MACPO explicitly targets the
measurement and metrics important to performance
optimization for multicore chips. The article also
presents a complete process for integrating measurement
and analyses of code execution with measurements and
analyses of memory access patterns and behaviors for
performance optimization, specifically targeting
multicore chips and multichip nodes of clusters. MACPO
uses more realistic cache models for computation of
latency metrics than those used by previous tools.
Evaluation of the effectiveness of adding memory access
behavior characteristics of data structures to
performance optimization was done on subsets of the
ASCI, NAS and Rodinia parallel benchmarks and two
versions of one application program from a domain not
represented in these benchmarks. Adding characteristics
of the behavior of data structures enabled easier
diagnoses of bottlenecks and more accurate selection of
appropriate optimizations than with only code centric
behavior measurements. The performance gains ranged
from a few percent to 38 percent.",
acknowledgement = ack-nhfb,
articleno = "3",
fjournal = "ACM Transactions on Parallel Computing",
journal-URL = "http://dl.acm.org/citation.cfm?id=2632163",
}
@Article{Jimenez:2014:APP,
author = "V{\'\i}ctor Jim{\'e}nez and Francisco J. Cazorla and
Roberto Gioiosa and Alper Buyuktosunoglu and Pradip
Bose and Francis P. O'Connell and Bruce G. Mealey",
title = "Adaptive Prefetching on {POWER7}: Improving
Performance and Power Consumption",
journal = j-TOPC,
volume = "1",
number = "1",
pages = "4:1--4:??",
month = sep,
year = "2014",
CODEN = "????",
DOI = "https://doi.org/10.1145/2588889",
ISSN = "2329-4949 (print), 2329-4957 (electronic)",
ISSN-L = "2329-4949",
bibdate = "Fri Oct 17 12:28:03 MDT 2014",
bibsource = "http://topc.acm.org/;
http://www.math.utah.edu/pub/tex/bib/topc.bib",
abstract = "Hardware data prefetch engines are integral parts of
many general purpose server-class microprocessors in
the field today. Some prefetch engines allow users to
change some of their parameters. But, the prefetcher is
usually enabled in a default configuration during
system bring-up, and dynamic reconfiguration of the
prefetch engine is not an autonomic feature of current
machines. Conceptually, however, it is easy to infer
that commonly used prefetch algorithms---when applied
in a fixed mode---will not help performance in many
cases. In fact, they may actually degrade performance
due to useless bus bandwidth consumption and cache
pollution, which in turn, will also waste power. We
present an adaptive prefetch scheme that dynamically
modifies the prefetch settings in order to adapt to
workloads' requirements. We use a commercial processor,
namely the IBM POWER7 as a vehicle for our study. First
we characterize---in terms of performance and power
consumption---the prefetcher in that processor using
microbenchmarks and SPEC CPU2006. We then present our
adaptive prefetch mechanism showing performance
improvements with respect to the default prefetch
setting up to 2.7X and 1.3X for single-threaded and
multiprogrammed workloads, respectively. Adaptive
prefetching is also able to reduce power consumption in
some cases. Finally, we also evaluate our mechanism
with SPECjbb2005, improving both performance and power
consumption.",
acknowledgement = ack-nhfb,
articleno = "4",
fjournal = "ACM Transactions on Parallel Computing",
journal-URL = "http://dl.acm.org/citation.cfm?id=2632163",
}
@Article{Heil:2014:APH,
author = "Timothy Heil and Anil Krishna and Nicholas Lindberg
and Farnaz Toussi and Steven Vanderwiel",
title = "Architecture and Performance of the Hardware
Accelerators in {IBM}'s {PowerEN} Processor",
journal = j-TOPC,
volume = "1",
number = "1",
pages = "5:1--5:??",
month = sep,
year = "2014",
CODEN = "????",
DOI = "https://doi.org/10.1145/2588888",
ISSN = "2329-4949 (print), 2329-4957 (electronic)",
ISSN-L = "2329-4949",
bibdate = "Fri Oct 17 12:28:03 MDT 2014",
bibsource = "http://topc.acm.org/;
http://www.math.utah.edu/pub/tex/bib/topc.bib",
abstract = "Computation at the edge of a datacenter has unique
characteristics. It deals with streaming data from
multiple sources, going to multiple destinations, often
requiring repeated application of one or more of
several standard algorithmic kernels. These kernels,
related to encryption, compression, XML Parsing and
regular expression searching on the data, demand a high
data processing rate and power efficiency. This
suggests the use of hardware acceleration for key
functions. However, robust general purpose processing
support is necessary to orchestrate the flow of data
between accelerators, as well as perform tasks that are
not suited to acceleration. Further, these accelerators
must be tightly integrated with the general purpose
computation in order to keep invocation overhead and
latency low. The accelerators must be easy for software
to use, and the system must be flexible enough to
support evolving networking standards. In this article,
we describe and evaluate the architecture of IBM's
PowerEN processor, with a focus on PowerEN's
architectural enhancements and its on-chip hardware
accelerators. PowerEN unites the throughput of
application-specific accelerators with the
programmability of general purpose cores on a single
coherent memory architecture. Hardware acceleration
improves throughput by orders of magnitude in some
cases compared to equivalent computation on the general
purpose cores. By offloading work to the accelerators,
general purpose cores are freed to simultaneously work
on computation less suited to acceleration.",
acknowledgement = ack-nhfb,
articleno = "5",
fjournal = "ACM Transactions on Parallel Computing",
journal-URL = "http://dl.acm.org/citation.cfm?id=2632163",
}
@Article{Wu:2014:MAG,
author = "Xing Wu and Frank Mueller and Scott Pakin",
title = "A methodology for automatic generation of executable
communication specifications from parallel {MPI}
applications",
journal = j-TOPC,
volume = "1",
number = "1",
pages = "6:1--6:??",
month = sep,
year = "2014",
CODEN = "????",
DOI = "https://doi.org/10.1145/2660249",
ISSN = "2329-4949 (print), 2329-4957 (electronic)",
ISSN-L = "2329-4949",
bibdate = "Fri Oct 17 12:28:03 MDT 2014",
bibsource = "http://topc.acm.org/;
http://www.math.utah.edu/pub/tex/bib/topc.bib",
abstract = "Portable parallel benchmarks are widely used for
performance evaluation of HPC systems. However, because
these are manually produced, they generally represent a
greatly simplified view of application behavior,
missing the subtle but important-to-performance nuances
that may exist in a complete application. This work
contributes novel methods to automatically generate
highly portable and customizable communication
benchmarks from HPC applications. We utilize
ScalaTrace, a lossless yet scalable
parallel-application tracing framework to collect
selected aspects of the run-time behavior of HPC
applications, including communication operations and
computation time, while abstracting away the details of
the computation proper. We subsequently generate
benchmarks with nearly identical run-time behavior to
the original applications. Results demonstrate that the
generated benchmarks are in fact able to preserve the
run-time behavior (including both the communication
pattern and the execution time) of the original
applications. Such automated benchmark generation is
without precedent and particularly valuable for
proprietary, export-controlled, or classified
application codes.",
acknowledgement = ack-nhfb,
articleno = "6",
fjournal = "ACM Transactions on Parallel Computing",
journal-URL = "http://dl.acm.org/citation.cfm?id=2632163",
}
@Article{Ravishankar:2014:APC,
author = "Mahesh Ravishankar and John Eisenlohr and
Louis-No{\"e}l Pouchet and J. Ramanujam and Atanas
Rountev and P. Sadayappan",
title = "Automatic parallelization of a class of irregular
loops for distributed memory systems",
journal = j-TOPC,
volume = "1",
number = "1",
pages = "7:1--7:??",
month = sep,
year = "2014",
CODEN = "????",
DOI = "https://doi.org/10.1145/2660251",
ISSN = "2329-4949 (print), 2329-4957 (electronic)",
ISSN-L = "2329-4949",
bibdate = "Fri Oct 17 12:28:03 MDT 2014",
bibsource = "http://topc.acm.org/;
http://www.math.utah.edu/pub/tex/bib/topc.bib",
abstract = "Many scientific applications spend significant time
within loops that are parallel, except for dependences
from associative reduction operations. However these
loops often contain data-dependent control-flow and
array-access patterns. Traditional optimizations that
rely on purely static analysis fail to generate
parallel code in such cases. This article proposes an
approach for automatic parallelization for distributed
memory environments, using both static and runtime
analysis. We formalize the computations that are
targeted by this approach and develop algorithms to
detect such computations. We also describe algorithms
to generate a parallel inspector that performs a
runtime analysis of control-flow and array-access
patterns, and a parallel executor to take advantage of
this information. The effectiveness of the approach is
demonstrated on several benchmarks that were
automatically transformed using a prototype compiler.
For these, the inspector overheads and performance of
the executor code were measured. The benefit on
real-world applications was also demonstrated through
similar manual transformations of an atmospheric
modeling software.",
acknowledgement = ack-nhfb,
articleno = "7",
fjournal = "ACM Transactions on Parallel Computing",
journal-URL = "http://dl.acm.org/citation.cfm?id=2632163",
}
@Article{Shun:2014:SPC,
author = "Julian Shun and Guy E. Blelloch",
title = "A simple parallel {Cartesian} tree algorithm and its
application to parallel suffix tree construction",
journal = j-TOPC,
volume = "1",
number = "1",
pages = "8:1--8:??",
month = sep,
year = "2014",
CODEN = "????",
DOI = "https://doi.org/10.1145/2661653",
ISSN = "2329-4949 (print), 2329-4957 (electronic)",
ISSN-L = "2329-4949",
bibdate = "Fri Oct 17 12:28:03 MDT 2014",
bibsource = "http://topc.acm.org/;
http://www.math.utah.edu/pub/tex/bib/topc.bib",
abstract = "We present a simple linear work and space, and
polylogarithmic time parallel algorithm for generating
multiway Cartesian trees. We show that bottom-up
traversals of the multiway Cartesian tree on the
interleaved suffix array and longest common prefix
array of a string can be used to answer certain string
queries. By adding downward pointers in the tree (e.g.
using a hash table), we can also generate suffix trees
from suffix arrays on arbitrary alphabets in the same
bounds. In conjunction with parallel suffix array
algorithms, such as the skew algorithm, this gives a
rather simple linear work parallel, $O(n \epsilon) $
time $ (0 < \epsilon < 1) $, algorithm for generating
suffix trees over an integer alphabet $ \Sigma \{ 1,
\ldots {}, n \} $, where $n$ is the length of the input
string. It also gives a linear work parallel algorithm
requiring $O(\log_2 n) $ time with high probability
for constant-sized alphabets. More generally, given a
sorted sequence of strings and the longest common
prefix lengths between adjacent elements, the algorithm
will generate a patricia tree (compacted trie) over the
strings. Of independent interest, we describe a
work-efficient parallel algorithm for solving the all
nearest smaller values problem using Cartesian trees,
which is much simpler than the work-efficient parallel
algorithm described in previous work. We also present
experimental results comparing the performance of the
algorithm to existing sequential implementations and a
second parallel algorithm that we implement. We present
comparisons for the Cartesian tree algorithm on its own
and for constructing a suffix tree. The results show
that on a variety of strings our algorithm is
competitive with the sequential version on a single
processor and achieves good speedup on multiple
processors. We present experiments for three
applications that require only the Cartesian tree, and
also for searching using the suffix tree.",
acknowledgement = ack-nhfb,
articleno = "8",
fjournal = "ACM Transactions on Parallel Computing",
journal-URL = "http://dl.acm.org/citation.cfm?id=2632163",
}
@Article{Pingali:2015:ISI,
author = "Keshav Pingali and J. Ramanujam and P. Sadayappan",
title = "Introduction to the Special Issue on {PPoPP'12}",
journal = j-TOPC,
volume = "1",
number = "2",
pages = "9:1--9:??",
month = jan,
year = "2015",
CODEN = "????",
DOI = "https://doi.org/10.1145/2716343",
ISSN = "2329-4949 (print), 2329-4957 (electronic)",
ISSN-L = "2329-4949",
bibdate = "Wed Feb 18 16:46:00 MST 2015",
bibsource = "http://topc.acm.org/;
http://www.math.utah.edu/pub/tex/bib/topc.bib",
acknowledgement = ack-nhfb,
articleno = "9",
fjournal = "ACM Transactions on Parallel Computing",
journal-URL = "http://dl.acm.org/citation.cfm?id=2632163",
}
@Article{Bouteiller:2015:ABF,
author = "Aurelien Bouteiller and Thomas Herault and George
Bosilca and Peng Du and Jack Dongarra",
title = "Algorithm-Based Fault Tolerance for Dense Matrix
Factorizations, Multiple Failures and Accuracy",
journal = j-TOPC,
volume = "1",
number = "2",
pages = "10:1--10:??",
month = jan,
year = "2015",
CODEN = "????",
DOI = "https://doi.org/10.1145/2686892",
ISSN = "2329-4949 (print), 2329-4957 (electronic)",
ISSN-L = "2329-4949",
bibdate = "Wed Feb 18 16:46:00 MST 2015",
bibsource = "http://topc.acm.org/;
http://www.math.utah.edu/pub/bibnet/authors/d/dongarra-jack-j.bib;
http://www.math.utah.edu/pub/tex/bib/topc.bib",
abstract = "Dense matrix factorizations, such as LU, Cholesky and
QR, are widely used for scientific applications that
require solving systems of linear equations,
eigenvalues and linear least squares problems. Such
computations are normally carried out on
supercomputers, whose ever-growing scale induces a fast
decline of the Mean Time To Failure (MTTF). This
article proposes a new hybrid approach, based on
Algorithm-Based Fault Tolerance (ABFT), to help matrix
factorizations algorithms survive fail-stop failures.
We consider extreme conditions, such as the absence of
any reliable node and the possibility of losing both
data and checksum from a single failure. We will
present a generic solution for protecting the right
factor, where the updates are applied, of all above
mentioned factorizations. For the left factor, where
the panel has been applied, we propose a scalable
checkpointing algorithm. This algorithm features high
degree of checkpointing parallelism and cooperatively
utilizes the checksum storage leftover from the right
factor protection. The fault-tolerant algorithms
derived from this hybrid solution is applicable to a
wide range of dense matrix factorizations, with minor
modifications. Theoretical analysis shows that the
fault tolerance overhead decreases inversely to the
scaling in the number of computing units and the
problem size. Experimental results of LU and QR
factorization on the Kraken (Cray XT5) supercomputer
validate the theoretical evaluation and confirm
negligible overhead, with- and without-errors.
Applicability to tolerate multiple failures and
accuracy after multiple recovery is also considered.",
acknowledgement = ack-nhfb,
articleno = "10",
fjournal = "ACM Transactions on Parallel Computing",
journal-URL = "http://dl.acm.org/citation.cfm?id=2632163",
}
@Article{Ballard:2015:ACS,
author = "Grey Ballard and James Demmel and Nicholas Knight",
title = "Avoiding Communication in Successive Band Reduction",
journal = j-TOPC,
volume = "1",
number = "2",
pages = "11:1--11:??",
month = jan,
year = "2015",
CODEN = "????",
DOI = "https://doi.org/10.1145/2686877",
ISSN = "2329-4949 (print), 2329-4957 (electronic)",
ISSN-L = "2329-4949",
bibdate = "Wed Feb 18 16:46:00 MST 2015",
bibsource = "http://topc.acm.org/;
http://www.math.utah.edu/pub/tex/bib/topc.bib",
abstract = "The running time of an algorithm depends on both
arithmetic and communication (i.e., data movement)
costs, and the relative costs of communication are
growing over time. In this work, we present sequential
and distributed-memory parallel algorithms for
tridiagonalizing full symmetric and symmetric band
matrices that asymptotically reduce communication
compared to previous approaches. The tridiagonalization
of a symmetric band matrix is a key kernel in solving
the symmetric eigenvalue problem for both full and band
matrices. In order to preserve structure,
tridiagonalization routines use annihilate-and-chase
procedures that previously have suffered from poor data
locality and high parallel latency cost. We improve
both by reorganizing the computation and obtain
asymptotic improvements. We also propose new algorithms
for reducing a full symmetric matrix to band form in a
communication-efficient manner. In this article, we
consider the cases of computing eigenvalues only and of
computing eigenvalues and all eigenvectors.",
acknowledgement = ack-nhfb,
articleno = "11",
fjournal = "ACM Transactions on Parallel Computing",
journal-URL = "http://dl.acm.org/citation.cfm?id=2632163",
}
@Article{Sack:2015:CAM,
author = "Paul Sack and William Gropp",
title = "Collective Algorithms for Multiported Torus Networks",
journal = j-TOPC,
volume = "1",
number = "2",
pages = "12:1--12:??",
month = jan,
year = "2015",
CODEN = "????",
DOI = "https://doi.org/10.1145/2686882",
ISSN = "2329-4949 (print), 2329-4957 (electronic)",
ISSN-L = "2329-4949",
bibdate = "Wed Feb 18 16:46:00 MST 2015",
bibsource = "http://topc.acm.org/;
http://www.math.utah.edu/pub/tex/bib/topc.bib",
abstract = "Modern supercomputers with torus networks allow each
node to simultaneously pass messages on all of its
links. However, most collective algorithms are designed
to only use one link at a time. In this work, we
present novel multiported algorithms for the scatter,
gather, all-gather, and reduce-scatter operations. Our
algorithms can be combined to create multiported
reduce, all-reduce, and broadcast algorithms. Several
of these algorithms involve a new technique where we
relax the MPI message-ordering constraints to achieve
high performance and restore the correct ordering using
an additional stage of redundant communication.
According to our models, on an $n$-dimensional torus,
our algorithms should allow for nearly a $2 n$-fold
improvement in communication performance compared to
known, single-ported torus algorithms. In practice, we
have achieved nearly $6\times$ better performance on a
32k-node 3-dimensional torus.",
acknowledgement = ack-nhfb,
articleno = "12",
fjournal = "ACM Transactions on Parallel Computing",
journal-URL = "http://dl.acm.org/citation.cfm?id=2632163",
}
@Article{Dice:2015:LCG,
author = "David Dice and Virendra J. Marathe and Nir Shavit",
title = "Lock Cohorting: a General Technique for Designing
{NUMA} Locks",
journal = j-TOPC,
volume = "1",
number = "2",
pages = "13:1--13:??",
month = jan,
year = "2015",
CODEN = "????",
DOI = "https://doi.org/10.1145/2686884",
ISSN = "2329-4949 (print), 2329-4957 (electronic)",
ISSN-L = "2329-4949",
bibdate = "Wed Feb 18 16:46:00 MST 2015",
bibsource = "http://topc.acm.org/;
http://www.math.utah.edu/pub/tex/bib/topc.bib",
abstract = "Multicore machines are quickly shifting to NUMA and
CC-NUMA architectures, making scalable NUMA-aware
locking algorithms, ones that take into account the
machine's nonuniform memory and caching hierarchy, ever
more important. This article presents lock cohorting, a
general new technique for designing NUMA-aware locks
that is as simple as it is powerful. Lock cohorting
allows one to transform any spin-lock algorithm, with
minimal nonintrusive changes,into a scalable NUMA-aware
spin-lock. Our new cohorting technique allows us to
easily create NUMA-aware versions of the TATAS-Backoff,
CLH, MCS, and ticket locks, to name a few. Moreover, it
allows us to derive a CLH-based cohort abortable lock,
the first NUMA-aware queue lock to support
abortability. We empirically compared the performance
of cohort locks with prior NUMA-aware and classic
NUMA-oblivious locks on a synthetic micro-benchmark, a
real world key-value store application memcached, as
well as the libc memory allocator. Our results
demonstrate that cohort locks perform as well or better
than known locks when the load is low and significantly
out-perform them as the load increases.",
acknowledgement = ack-nhfb,
articleno = "13",
fjournal = "ACM Transactions on Parallel Computing",
journal-URL = "http://dl.acm.org/citation.cfm?id=2632163",
}
@Article{Merrill:2015:HPS,
author = "Duane Merrill and Michael Garland and Andrew
Grimshaw",
title = "High-Performance and Scalable {GPU} Graph Traversal",
journal = j-TOPC,
volume = "1",
number = "2",
pages = "14:1--14:??",
month = jan,
year = "2015",
CODEN = "????",
DOI = "https://doi.org/10.1145/2717511",
ISSN = "2329-4949 (print), 2329-4957 (electronic)",
ISSN-L = "2329-4949",
bibdate = "Wed Feb 18 16:46:00 MST 2015",
bibsource = "http://topc.acm.org/;
http://www.math.utah.edu/pub/tex/bib/topc.bib",
abstract = "Breadth-First Search (BFS) is a core primitive for
graph traversal and a basis for many higher-level graph
analysis algorithms. It is also representative of a
class of parallel computations whose memory accesses
and work distribution are both irregular and data
dependent. Recent work has demonstrated the
plausibility of GPU sparse graph traversal, but has
tended to focus on asymptotically inefficient
algorithms that perform poorly on graphs with
nontrivial diameter. We present a BFS parallelization
focused on fine-grained task management constructed
from efficient prefix sum computations that achieves an
asymptotically optimal O(|V| + |E|) gd work complexity.
Our implementation delivers excellent performance on
diverse graphs, achieving traversal rates in excess of
3.3 billion and 8.3 billion traversed edges per second
using single- and quad-GPU configurations,
respectively. This level of performance is several
times faster than state-of-the-art implementations on
both CPU and GPU platforms.",
acknowledgement = ack-nhfb,
articleno = "14",
fjournal = "ACM Transactions on Parallel Computing",
journal-URL = "http://dl.acm.org/citation.cfm?id=2632163",
}
@Article{Kramer:2015:SET,
author = "Stephan C. Kramer and Johannes Hagemann",
title = "{SciPAL}: Expression Templates and Composition Closure
Objects for High Performance Computational Physics with
{CUDA} and {OpenMP}",
journal = j-TOPC,
volume = "1",
number = "2",
pages = "15:1--15:??",
month = jan,
year = "2015",
CODEN = "????",
DOI = "https://doi.org/10.1145/2686886",
ISSN = "2329-4949 (print), 2329-4957 (electronic)",
ISSN-L = "2329-4949",
bibdate = "Wed Feb 18 16:46:00 MST 2015",
bibsource = "http://topc.acm.org/;
http://www.math.utah.edu/pub/tex/bib/topc.bib",
abstract = "We present SciPAL (scientific parallel algorithms
library), a C ++-based, hardware-independent
open-source library. Its core is a domain-specific
embedded language for numerical linear algebra. The
main fields of application are finite element
simulations, coherent optics and the solution of
inverse problems. Using SciPAL algorithms can be stated
in a mathematically intuitive way in terms of matrix
and vector operations. Existing algorithms can easily
be adapted to GPU-based computing by proper template
specialization. Our library is compatible with the
finite element library deal .II and provides a port of
deal.II's most frequently used linear algebra classes
to CUDA (NVidia's extension of the programming
languages C and C ++ for programming their GPUs).
SciPAL 's operator-based API for BLAS operations
particularly aims at simplifying the usage of NVidia's
CUBLAS. For non-BLAS array arithmetic SciPAL 's
expression templates are able to generate CUDA kernels
at compile time. We demonstrate the benefits of SciPAL
using the iterative principal component analysis as
example which is the core algorithm for the
spike-sorting problem in neuroscience.",
acknowledgement = ack-nhfb,
articleno = "15",
fjournal = "ACM Transactions on Parallel Computing",
journal-URL = "http://dl.acm.org/citation.cfm?id=2632163",
}
@Article{Totoni:2015:PME,
author = "Ehsan Totoni and Nikhil Jain and Laxmikant V. Kale",
title = "Power Management of Extreme-Scale Networks with
On\slash Off Links in Runtime Systems",
journal = j-TOPC,
volume = "1",
number = "2",
pages = "16:1--16:??",
month = jan,
year = "2015",
CODEN = "????",
DOI = "https://doi.org/10.1145/2687001",
ISSN = "2329-4949 (print), 2329-4957 (electronic)",
ISSN-L = "2329-4949",
bibdate = "Wed Feb 18 16:46:00 MST 2015",
bibsource = "http://topc.acm.org/;
http://www.math.utah.edu/pub/tex/bib/topc.bib",
abstract = "Networks are among major power consumers in
large-scale parallel systems. During execution of
common parallel applications, a sizeable fraction of
the links in the high-radix interconnects are either
never used or are underutilized. We propose a runtime
system based adaptive approach to turn off unused
links, which has various advantages over the previously
proposed hardware and compiler based approaches. We
discuss why the runtime system is the best system
component to accomplish this task, and test the
effectiveness of our approach using real applications
(including NAMD, MILC), and application benchmarks
(including NAS Parallel Benchmarks, Stencil). These
codes are simulated on representative topologies such
as 6-D Torus and multilevel directly connected network
(similar to IBM PERCS in Power 775 and Dragonfly in
Cray Aries). For common applications with near-neighbor
communication pattern, our approach can save up to 20\%
of total machine's power and energy, without any
performance penalty.",
acknowledgement = ack-nhfb,
articleno = "16",
fjournal = "ACM Transactions on Parallel Computing",
journal-URL = "http://dl.acm.org/citation.cfm?id=2632163",
}
@Article{Herlihy:2015:GEI,
author = "Maurice Herlihy",
title = "{Guest Editor} Introduction",
journal = j-TOPC,
volume = "2",
number = "1",
pages = "1:1--1:??",
month = may,
year = "2015",
CODEN = "????",
DOI = "https://doi.org/10.1145/2716306",
ISSN = "2329-4949 (print), 2329-4957 (electronic)",
ISSN-L = "2329-4949",
bibdate = "Thu May 21 16:27:00 MDT 2015",
bibsource = "http://topc.acm.org/;
http://www.math.utah.edu/pub/tex/bib/topc.bib",
acknowledgement = ack-nhfb,
articleno = "1",
fjournal = "ACM Transactions on Parallel Computing",
journal-URL = "http://dl.acm.org/citation.cfm?id=2632163",
}
@Article{Degener:2015:LCS,
author = "Bastian Degener and Barbara Kempkes and Peter Kling
and Friedhelm {Meyer Auf Der Heide}",
title = "Linear and Competitive Strategies for Continuous Robot
Formation Problems",
journal = j-TOPC,
volume = "2",
number = "1",
pages = "2:1--2:??",
month = may,
year = "2015",
CODEN = "????",
DOI = "https://doi.org/10.1145/2742341",
ISSN = "2329-4949 (print), 2329-4957 (electronic)",
ISSN-L = "2329-4949",
bibdate = "Thu May 21 16:27:00 MDT 2015",
bibsource = "http://topc.acm.org/;
http://www.math.utah.edu/pub/tex/bib/topc.bib",
abstract = "We study a scenario in which n mobile robots with a
limited viewing range are distributed in the Euclidean
plane and have to solve a formation problem. The
formation problems we consider are the Gathering
problem and the Chain-Formation problem. In the
Gathering problem, the robots have to gather in one
(not predefined) point, while in the Chain-Formation
problem they have to form a connected communication
chain of minimal length between two stationary base
stations. Each robot may base its decisions where to
move only on the current relative positions of
neighboring robots (that are within its viewing range);
that is, besides having a limited viewing range, the
robots are oblivious (they do not use information from
the past), have none or only very limited identities,
and they do not have a common sense of direction.
Variants of these problems (especially for the
Gathering problem) have been studied extensively in
different discrete time models. In contrast, our work
focuses on a continuous time model; that is, the robots
continuously sense the positions of other robots within
their viewing range and continuously adapt their speed
and direction according to some simple, local rules.
Hereby, we assume that the robots have a maximum
movement speed of one. We show that this idealized idea
of continuous sensing allows us to solve the mentioned
formation problems in linear time $ O(n) $ (which,
given the maximum speed of one, immediately yields a
maximum traveled distance of $ O(n)$). Note that in the
more classical discrete time models, the best known
strategies need at least $ \Theta (n^2)$ or even $
\Theta (n^2 \log n)$ timesteps to solve these problems.
For the Gathering problem, our analysis solves a
problem left open by Gordon et al. [2004], where the
authors could prove that gathering in a continuous
model is possible in finite time, but were not able to
give runtime bounds. Apart from these linear bounds, we
also provide runtime bounds for both formation problems
that relate the runtime of our strategies to the
runtime of an optimal, global algorithm. Specifically,
we show that our strategy for the Gathering problem is
log OPT-competitive and the strategy for the
Chain-Formation problem is $ \log n$ competitive. Here,
by $c$-competitive, we mean that our (local) strategy
is asymptotically by at most a factor of $c$ slower
than an optimal, global strategy.",
acknowledgement = ack-nhfb,
articleno = "2",
fjournal = "ACM Transactions on Parallel Computing",
journal-URL = "http://dl.acm.org/citation.cfm?id=2632163",
}
@Article{Jain:2015:NOS,
author = "Navendu Jain and Ishai Menache and Joseph (Seffi) Naor
and Jonathan Yaniv",
title = "Near-Optimal Scheduling Mechanisms for
Deadline-Sensitive Jobs in Large Computing Clusters",
journal = j-TOPC,
volume = "2",
number = "1",
pages = "3:1--3:??",
month = may,
year = "2015",
CODEN = "????",
DOI = "https://doi.org/10.1145/2742343",
ISSN = "2329-4949 (print), 2329-4957 (electronic)",
ISSN-L = "2329-4949",
bibdate = "Thu May 21 16:27:00 MDT 2015",
bibsource = "http://topc.acm.org/;
http://www.math.utah.edu/pub/tex/bib/topc.bib",
abstract = "We consider a market-based resource allocation model
for batch jobs in cloud computing clusters. In our
model, we incorporate the importance of the due date of
a job rather than the number of servers allocated to it
at any given time. Each batch job is characterized by
the work volume of total computing units (e.g., CPU
hours) along with a bound on maximum degree of
parallelism. Users specify, along with these job
characteristics, their desired due date and a value for
finishing the job by its deadline. Given this
specification, the primary goal is to determine the
scheduling of cloud computing instances under capacity
constraints in order to maximize the social welfare
(i.e., sum of values gained by allocated users). Our
main result is a new (CC-kcss-1)-approximation
algorithm for this objective, where $C$ denotes cloud
capacity, $k$ is the maximal bound on parallelized
execution (in practical settings, $ k < C$) and $s$ is
the slackness on the job completion time, that is, the
minimal ratio between a specified deadline and the
earliest finish time of a job. Our algorithm is based
on utilizing dual fitting arguments over a strengthened
linear program to the problem. Based on the new
approximation algorithm, we construct truthful
allocation and pricing mechanisms, in which reporting
the true value and other properties of the job
(deadline, work volume, and the parallelism bound) is a
dominant strategy for all users. To that end, we extend
known results for single-value settings to provide a
general framework for transforming allocation
algorithms into truthful mechanisms in domains of
single-value and multi-properties. We then show that
the basic mechanism can be extended under proper
Bayesian assumptions to the objective of maximizing
revenues, which is important for public clouds. We
empirically evaluate the benefits of our approach
through simulations on data-center job traces, and show
that the revenues obtained under our mechanism are
comparable with an ideal fixed-price mechanism, which
sets an on-demand price using oracle knowledge of
users' valuations. Finally, we discuss how our model
can be extended to accommodate uncertainties in job
work volumes, which is a practical challenge in cloud
settings.",
acknowledgement = ack-nhfb,
articleno = "3",
fjournal = "ACM Transactions on Parallel Computing",
journal-URL = "http://dl.acm.org/citation.cfm?id=2632163",
}
@Article{Feldman:2015:HCG,
author = "Moran Feldman and Liane Lewin-Eytan and Joseph (Seffi)
Naor",
title = "Hedonic Clustering Games",
journal = j-TOPC,
volume = "2",
number = "1",
pages = "4:1--4:??",
month = may,
year = "2015",
CODEN = "????",
DOI = "https://doi.org/10.1145/2742345",
ISSN = "2329-4949 (print), 2329-4957 (electronic)",
ISSN-L = "2329-4949",
bibdate = "Thu May 21 16:27:00 MDT 2015",
bibsource = "http://topc.acm.org/;
http://www.math.utah.edu/pub/tex/bib/topc.bib",
abstract = "Clustering, the partitioning of objects with respect
to a similarity measure, has been extensively studied
as a global optimization problem. We investigate
clustering from a game-theoretic approach, and consider
the class of hedonic clustering games. Here, a
self-organized clustering is obtained via decisions
made by independent players, corresponding to the
elements clustered. Being a hedonic setting, the
utility of each player is determined by the identity of
the other members of her cluster. This class of games
seems to be quite robust, as it fits with rather
different, yet commonly used, clustering criteria.
Specifically, we investigate hedonic clustering games
in two different models: fixed clustering, which
subdivides into $k$-median and $k$-center, and
correlation clustering. We provide a thorough analysis
of these games, characterizing Nash equilibria, and
proving upper and lower bounds on the price of anarchy
and price of stability. For fixed clustering we focus
on the existence of a Nash equilibrium, as it is a
rather nontrivial issue in this setting. We study it
both for general metrics and special cases, such as
line and tree metrics. In the correlation clustering
model, we study both minimization and maximization
variants, and provide almost tight bounds on both the
price of anarchy and price of stability.",
acknowledgement = ack-nhfb,
articleno = "4",
fjournal = "ACM Transactions on Parallel Computing",
journal-URL = "http://dl.acm.org/citation.cfm?id=2632163",
}
@Article{Jahn:2015:RRA,
author = "Janmartin Jahn and Santiago Pagani and Sebastian Kobbe
and Jian-Jia Chen and J{\"o}rg Henkel",
title = "Runtime Resource Allocation for Software Pipelines",
journal = j-TOPC,
volume = "2",
number = "1",
pages = "5:1--5:??",
month = may,
year = "2015",
CODEN = "????",
DOI = "https://doi.org/10.1145/2742347",
ISSN = "2329-4949 (print), 2329-4957 (electronic)",
ISSN-L = "2329-4949",
bibdate = "Thu May 21 16:27:00 MDT 2015",
bibsource = "http://topc.acm.org/;
http://www.math.utah.edu/pub/tex/bib/topc.bib",
abstract = "Efficiently allocating the computational resources of
many-core systems is one of the most prominent
challenges, especially when resource requirements may
vary unpredictably at runtime. This is even more
challenging when facing unreliable cores --- a scenario
that becomes common as the number of cores increases
and integration sizes shrink. To address this
challenge, this article presents an optimal method for
the allocation of the resources to software-pipelined
applications. Here we show how runtime observations of
the resource requirements of tasks can be used to adapt
resource allocations. Furthermore, we show how the
optimum can be traded for a high degree of scalability
by clustering applications in a distributed,
hierarchical manner. To diminish the negative effects
of unreliable cores, this article shows how
self-organization can effectively restore the integrity
of such a hierarchy when it is corrupted by a failing
core. Experiments on Intel's 48-core Single-Chip Cloud
Computer and in a many-core simulator show that a
significant improvement in system throughput can be
achieved over the current state of the art.",
acknowledgement = ack-nhfb,
articleno = "5",
fjournal = "ACM Transactions on Parallel Computing",
journal-URL = "http://dl.acm.org/citation.cfm?id=2632163",
}
@Article{Xu:2015:SVC,
author = "Yi Xu and Bo Zhao and Youtao Zhang and Jun Yang",
title = "Simple Virtual Channel Allocation for High-Throughput
and High-Frequency On-Chip Routers",
journal = j-TOPC,
volume = "2",
number = "1",
pages = "6:1--6:??",
month = may,
year = "2015",
CODEN = "????",
DOI = "https://doi.org/10.1145/2742349",
ISSN = "2329-4949 (print), 2329-4957 (electronic)",
ISSN-L = "2329-4949",
bibdate = "Thu May 21 16:27:00 MDT 2015",
bibsource = "http://topc.acm.org/;
http://www.math.utah.edu/pub/tex/bib/topc.bib",
abstract = "Packet-switched network-on-chip (NoC) has provided a
scalable solution to the communications for tiled
multicore processors. However, the virtual channel (VC)
buffers in the NoC consume significant dynamic and
leakage power. To improve the energy efficiency of the
router design, it is advantageous to use small buffer
sizes while still maintaining throughput of the
network. This article proposes two new virtual channel
allocation (VA) mechanisms, termed fixed VC assignment
with dynamic VC allocation (FVADA) and adjustable VC
assignment with dynamic VC allocation (AVADA). VCs are
designated to output ports and allocated to packets
according to such assignment. This can help to reduce
the head-of-line blocking. Such VC-output port
assignment can also be adjusted dynamically to
accommodate traffic changes. Simulation results show
that both mechanisms can improve network throughput by
41\% on average. Real traffic evaluation shows a
network latency reduction of up to 66\%. In addition,
AVADA can outperform the baseline in throughput with
only half of the buffer size. Finally, we are able to
achieve comparable or better throughput than a previous
dynamic VC allocator while reducing its critical path
delay by 57\%. Hence, the proposed VA mechanisms are
suitable for low-power, high-throughput, and
high-frequency NoC designs.",
acknowledgement = ack-nhfb,
articleno = "6",
fjournal = "ACM Transactions on Parallel Computing",
journal-URL = "http://dl.acm.org/citation.cfm?id=2632163",
}
@Article{Hammouda:2015:NTE,
author = "Adam Hammouda and Andrew R. Siegel and Stephen F.
Siegel",
title = "Noise-Tolerant Explicit Stencil Computations for
Nonuniform Process Execution Rates",
journal = j-TOPC,
volume = "2",
number = "1",
pages = "7:1--7:??",
month = may,
year = "2015",
CODEN = "????",
DOI = "https://doi.org/10.1145/2742351",
ISSN = "2329-4949 (print), 2329-4957 (electronic)",
ISSN-L = "2329-4949",
bibdate = "Thu May 21 16:27:00 MDT 2015",
bibsource = "http://topc.acm.org/;
http://www.math.utah.edu/pub/tex/bib/topc.bib",
abstract = "Next-generation HPC computing platforms are likely to
be characterized by significant, unpredictable
nonuniformities in execution time among compute nodes
and cores. The resulting load imbalances from this
nonuniformity are expected to arise from a variety of
sources-manufacturing discrepancies, dynamic power
management, runtime component failure, OS jitter,
software-mediated resiliency, and TLB/- cache
performance variations, for example. It is well
understood that existing algorithms with frequent
points of bulk synchronization will perform relatively
poorly in the presence of these sources of process
nonuniformity. Thus, recasting classic bulk synchronous
algorithms into more asynchronous, coarse-grained
parallelism is a critical area of research for
next-generation computing. We propose a class of
parallel algorithms for explicit stencil computations
that can tolerate these nonuniformities by decoupling
per process communication and computation in order for
each process to progress asynchronously while
maintaining solution correctness. These algorithms are
benchmarked with a $1$D domain decomposed (``slabbed'')
implementation of the $2$D heat equation as a model
problem, and are tested in the presence of simulated
nonuniform process execution rates. The resulting
performance is compared to a classic bulk synchronous
implementation of the model problem. Results show that
the runtime of this article's algorithm on a machine
with simulated process nonuniformities is 5--99\%
slower than the runtime of its classic counterpart on a
machine free of nonuniformities. However, when both
algorithms are run on a machine with comparable
synthetic process nonuniformities, this article's
algorithm is 1--37 times faster than its classic
counterpart.",
acknowledgement = ack-nhfb,
articleno = "7",
fjournal = "ACM Transactions on Parallel Computing",
journal-URL = "http://dl.acm.org/citation.cfm?id=2632163",
}
@Article{McCreesh:2015:SST,
author = "Ciaran McCreesh and Patrick Prosser",
title = "The Shape of the Search Tree for the Maximum Clique
Problem and the Implications for Parallel Branch and
Bound",
journal = j-TOPC,
volume = "2",
number = "1",
pages = "8:1--8:??",
month = may,
year = "2015",
CODEN = "????",
DOI = "https://doi.org/10.1145/2742359",
ISSN = "2329-4949 (print), 2329-4957 (electronic)",
ISSN-L = "2329-4949",
bibdate = "Thu May 21 16:27:00 MDT 2015",
bibsource = "http://topc.acm.org/;
http://www.math.utah.edu/pub/tex/bib/topc.bib",
abstract = "Finding a maximum clique in a given graph is one of
the fundamental NP-hard problems. We compare two
multicore thread-parallel adaptations of a
state-of-the-art branch-and-bound algorithm for the
maximum clique problem and provide a novel explanation
as to why they are successful. We show that load
balance is sometimes a problem but that the interaction
of parallel search order and the most likely location
of solutions within the search space is often the
dominating consideration. We use this explanation to
propose a new low-overhead, scalable work-splitting
mechanism. Our approach uses explicit early diversity
to avoid strong commitment to the weakest heuristic
advice and late resplitting for balance. More
generally, we argue that, for branch-and-bound,
parallel algorithm design should not be performed
independently of the underlying sequential algorithm.",
acknowledgement = ack-nhfb,
articleno = "8",
fjournal = "ACM Transactions on Parallel Computing",
journal-URL = "http://dl.acm.org/citation.cfm?id=2632163",
}
@Article{Hoefler:2015:RMA,
author = "Torsten Hoefler and James Dinan and Rajeev Thakur and
Brian Barrett and Pavan Balaji and William Gropp and
Keith Underwood",
title = "Remote Memory Access Programming in {MPI-3}",
journal = j-TOPC,
volume = "2",
number = "2",
pages = "9:1--9:??",
month = jul,
year = "2015",
CODEN = "????",
DOI = "https://doi.org/10.1145/2780584",
ISSN = "2329-4949 (print), 2329-4957 (electronic)",
ISSN-L = "2329-4949",
bibdate = "Fri Aug 7 10:22:35 MDT 2015",
bibsource = "http://topc.acm.org/;
http://www.math.utah.edu/pub/tex/bib/pvm.bib;
http://www.math.utah.edu/pub/tex/bib/topc.bib",
abstract = "The Message Passing Interface (MPI) 3.0 standard,
introduced in September 2012, includes a significant
update to the one-sided communication interface, also
known as remote memory access (RMA). In particular, the
interface has been extended to better support popular
one-sided and global-address-space parallel programming
models to provide better access to hardware performance
features and enable new data-access modes. We present
the new RMA interface and specify formal axiomatic
models for data consistency and access semantics. Such
models can help users reason about details of the
semantics that are hard to extract from the English
prose in the standard. It also fosters the development
of tools and compilers, enabling them to automatically
analyze, optimize, and debug RMA programs.",
acknowledgement = ack-nhfb,
articleno = "9",
fjournal = "ACM Transactions on Parallel Computing",
journal-URL = "http://dl.acm.org/citation.cfm?id=2632163",
}
@Article{Maldonado:2015:STB,
author = "Walther Maldonado and Patrick Marlier and Pascal
Felber and Julia Lawall and Gilles Muller and Etienne
Rivi{\`e}re",
title = "Supporting Time-Based {QoS} Requirements in Software
Transactional Memory",
journal = j-TOPC,
volume = "2",
number = "2",
pages = "10:1--10:??",
month = jul,
year = "2015",
CODEN = "????",
DOI = "https://doi.org/10.1145/2779621",
ISSN = "2329-4949 (print), 2329-4957 (electronic)",
ISSN-L = "2329-4949",
bibdate = "Fri Aug 7 10:22:35 MDT 2015",
bibsource = "http://topc.acm.org/;
http://www.math.utah.edu/pub/tex/bib/topc.bib",
abstract = "Software transactional memory (STM) is an optimistic
concurrency control mechanism that simplifies parallel
programming. However, there has been little interest in
its applicability to reactive applications in which
there is a required response time for certain
operations. We propose supporting such applications by
allowing programmers to associate time with atomic
blocks in the form of deadlines and quality-of-service
(QoS) requirements. Based on statistics of past
executions, we adjust the execution mode of
transactions by decreasing the level of optimism as the
deadline approaches. In the presence of concurrent
deadlines, we propose different conflict resolution
policies. Execution mode switching mechanisms allow the
meeting of multiple deadlines in a consistent manner,
with potential QoS degradations being split fairly
among several threads as contention increases, and
avoiding starvation. Our implementation consists of
extensions to an STM runtime that allow gathering
statistics and switching execution modes. We also
propose novel contention managers adapted to
transactional workloads subject to deadlines. The
experimental evaluation shows that our approaches
significantly improve the likelihood of a transaction
meeting its deadline and QoS requirement, even in cases
where progress is hampered by conflicts and other
concurrent transactions with deadlines.",
acknowledgement = ack-nhfb,
articleno = "10",
fjournal = "ACM Transactions on Parallel Computing",
journal-URL = "http://dl.acm.org/citation.cfm?id=2632163",
}
@Article{Kestor:2015:TPD,
author = "Gokcen Kestor and Osman S. Unsal and Adrian Cristal
and Serdar Tasiran",
title = "{TRADE}: Precise Dynamic Race Detection for Scalable
Transactional Memory Systems",
journal = j-TOPC,
volume = "2",
number = "2",
pages = "11:1--11:??",
month = jul,
year = "2015",
CODEN = "????",
DOI = "https://doi.org/10.1145/2786021",
ISSN = "2329-4949 (print), 2329-4957 (electronic)",
ISSN-L = "2329-4949",
bibdate = "Fri Aug 7 10:22:35 MDT 2015",
bibsource = "http://topc.acm.org/;
http://www.math.utah.edu/pub/tex/bib/multithreading.bib;
http://www.math.utah.edu/pub/tex/bib/topc.bib",
abstract = "As other multithreaded programs, transactional memory
(TM) programs are prone to race conditions. Previous
work focuses on extending existing definitions of data
race for lock-based applications to TM applications,
which requires all transactions to be totally ordered
``as if'' serialized by a global lock. This approach
poses implementation constraints on the STM that
severely limits TM applications' performance. This
article shows that forcing total ordering among all
running transactions, while sufficient, is not
necessary. We introduce an alternative data race
definition, relaxed transactional data race, that
requires ordering of only conflicting transactions. The
advantages of our relaxed definition are twofold:
First, unlike the previous definition, this definition
can be applied to a wide range of TMs, including those
that do not enforce transaction total ordering. Second,
within a single execution, it exposes a higher number
of data races, which considerably reduces debugging
time. Based on this definition, we propose a novel and
precise race detection tool for C/C++ TM applications
(TRADE), which detects data races by tracking
happens-before edges among conflicting transactions.
Our experiments reveal that TRADE precisely detects
data races for STAMP applications running on modern
STMs with overhead comparable to state-of-the-art race
detectors for lock-based applications. Our experiments
also show that in a single run, TRADE identifies
several races not discovered by 10 separate runs of a
race detection tool based on the previous data race
definition.",
acknowledgement = ack-nhfb,
articleno = "11",
fjournal = "ACM Transactions on Parallel Computing",
journal-URL = "http://dl.acm.org/citation.cfm?id=2632163",
}
@Article{Diegues:2015:TWE,
author = "Nuno Diegues and Paolo Romano",
title = "{Time-Warp}: Efficient Abort Reduction in
Transactional Memory",
journal = j-TOPC,
volume = "2",
number = "2",
pages = "12:1--12:??",
month = jul,
year = "2015",
CODEN = "????",
DOI = "https://doi.org/10.1145/2775435",
ISSN = "2329-4949 (print), 2329-4957 (electronic)",
ISSN-L = "2329-4949",
bibdate = "Fri Aug 7 10:22:35 MDT 2015",
bibsource = "http://topc.acm.org/;
http://www.math.utah.edu/pub/tex/bib/topc.bib",
abstract = "The multicore revolution that took place one decade
ago has turned parallel programming into a major
concern for the mainstream software development
industry. In this context, Transactional Memory (TM)
has emerged as a simpler and attractive alternative to
that of lock-based synchronization, whose complexity
and error-proneness are widely recognized. The notion
of permissiveness in TM translates to only aborting a
transaction when it cannot be accepted in any history
that guarantees a target correctness criterion. This
theoretically powerful property is often neglected by
state-of-the-art TMs because it imposes considerable
algorithmic costs. Instead, these TMs opt to maximize
their implementation's efficiency by aborting
transactions under overly conservative conditions. As a
result, they risk rejecting a significant number of
safe executions. In this article, we seek to identify a
sweet spot between permissiveness and efficiency by
introducing the Time-Warp Multiversion (TWM) algorithm.
TWM is based on the key idea of allowing an update
transaction that has performed stale reads (i.e.,
missed the writes of concurrently committed
transactions) to be serialized by ``committing it in
the past,'' which we call a time-warp commit. At its
core, TWM uses a novel, lightweight validation
mechanism with little computational overhead. TWM also
guarantees that read-only transactions can never be
aborted. Further, TWM guarantees Virtual World
Consistency, a safety property that is deemed as
particularly relevant in the context of TM. We
demonstrate the practicality of this approach through
an extensive experimental study: we compare TWM with
five other TMs, representative of typical alternative
design choices, and on a wide variety of benchmarks.
This study shows an average performance improvement
across all considered workloads and TMs of 65\% in high
concurrency scenarios, with gains extending up to $9
\times $ with the most favorable benchmarks. These
results are a consequence of TWM's ability to achieve
drastic reduction of aborts in scenarios of nonminimal
contention, while introducing little overhead
(approximately 10\%) in worst-case, synthetically
designed scenarios (i.e., no contention or contention
patterns that cannot be optimized using TWM).",
acknowledgement = ack-nhfb,
articleno = "12",
fjournal = "ACM Transactions on Parallel Computing",
journal-URL = "http://dl.acm.org/citation.cfm?id=2632163",
}
@Article{Eyraud-Dubois:2015:PST,
author = "Lionel Eyraud-Dubois and Loris Marchal and Oliver
Sinnen and Fr{\'e}d{\'e}ric Vivien",
title = "Parallel Scheduling of Task Trees with Limited
Memory",
journal = j-TOPC,
volume = "2",
number = "2",
pages = "13:1--13:??",
month = jul,
year = "2015",
CODEN = "????",
DOI = "https://doi.org/10.1145/2779052",
ISSN = "2329-4949 (print), 2329-4957 (electronic)",
ISSN-L = "2329-4949",
bibdate = "Fri Aug 7 10:22:35 MDT 2015",
bibsource = "http://topc.acm.org/;
http://www.math.utah.edu/pub/tex/bib/topc.bib",
abstract = "This article investigates the execution of tree-shaped
task graphs using multiple processors. Each edge of
such a tree represents some large data. A task can only
be executed if all input and output data fit into
memory, and a data can only be removed from memory
after the completion of the task that uses it as an
input data. Such trees arise in the multifrontal method
of sparse matrix factorization. The peak memory needed
for the processing of the entire tree depends on the
execution order of the tasks. With one processor, the
objective of the tree traversal is to minimize the
required memory. This problem was well studied, and
optimal polynomial algorithms were proposed. Here, we
extend the problem by considering multiple processors,
which is of obvious interest in the application area of
matrix factorization. With multiple processors comes
the additional objective to minimize the time needed to
traverse the tree-that is, to minimize the makespan.
Not surprisingly, this problem proves to be much harder
than the sequential one. We study the computational
complexity of this problem and provide
inapproximability results even for unit weight trees.
We design a series of practical heuristics achieving
different trade-offs between the minimization of peak
memory usage and makespan. Some of these heuristics are
able to process a tree while keeping the memory usage
under a given memory limit. The different heuristics
are evaluated in an extensive experimental evaluation
using realistic trees.",
acknowledgement = ack-nhfb,
articleno = "13",
fjournal = "ACM Transactions on Parallel Computing",
journal-URL = "http://dl.acm.org/citation.cfm?id=2632163",
}
@Article{Dinitz:2015:ISI,
author = "Michael Dinitz and Torsten Hoefler",
title = "Introduction to the Special Issue on {SPAA 2013}",
journal = j-TOPC,
volume = "2",
number = "3",
pages = "14:1--14:??",
month = oct,
year = "2015",
CODEN = "????",
DOI = "https://doi.org/10.1145/2809923",
ISSN = "2329-4949 (print), 2329-4957 (electronic)",
ISSN-L = "2329-4949",
bibdate = "Tue Nov 3 07:30:42 MST 2015",
bibsource = "http://topc.acm.org/;
http://www.math.utah.edu/pub/tex/bib/topc.bib",
acknowledgement = ack-nhfb,
articleno = "14e",
fjournal = "ACM Transactions on Parallel Computing",
journal-URL = "http://dl.acm.org/citation.cfm?id=2632163",
}
@Article{Kumar:2015:FGA,
author = "Ravi Kumar and Benjamin Moseley and Sergei
Vassilvitskii and Andrea Vattani",
title = "Fast Greedy Algorithms in {MapReduce} and Streaming",
journal = j-TOPC,
volume = "2",
number = "3",
pages = "14:1--14:??",
month = oct,
year = "2015",
CODEN = "????",
DOI = "https://doi.org/10.1145/2809814",
ISSN = "2329-4949 (print), 2329-4957 (electronic)",
ISSN-L = "2329-4949",
bibdate = "Tue Nov 3 07:30:42 MST 2015",
bibsource = "http://topc.acm.org/;
http://www.math.utah.edu/pub/tex/bib/topc.bib",
abstract = "Greedy algorithms are practitioners' best
friends---they are intuitive, are simple to implement,
and often lead to very good solutions. However,
implementing greedy algorithms in a distributed setting
is challenging since the greedy choice is inherently
sequential, and it is not clear how to take advantage
of the extra processing power. Our main result is a
powerful sampling technique that aids in
parallelization of sequential algorithms. Armed with
this primitive, we then adapt a broad class of greedy
algorithms to the MapReduce paradigm; this class
includes maximum cover and submodular maximization
subject to p -system constraint problems. Our method
yields efficient algorithms that run in a logarithmic
number of rounds while obtaining solutions that are
arbitrarily close to those produced by the standard
sequential greedy algorithm. We begin with algorithms
for modular maximization subject to a matroid
constraint and then extend this approach to obtain
approximation algorithms for submodular maximization
subject to knapsack or p -system constraints.",
acknowledgement = ack-nhfb,
articleno = "14",
fjournal = "ACM Transactions on Parallel Computing",
journal-URL = "http://dl.acm.org/citation.cfm?id=2632163",
}
@Article{Sanders:2015:WEM,
author = "Peter Sanders and Jochen Speck and Raoul Steffen",
title = "Work-Efficient Matrix Inversion in Polylogarithmic
Time",
journal = j-TOPC,
volume = "2",
number = "3",
pages = "15:1--15:??",
month = oct,
year = "2015",
CODEN = "????",
DOI = "https://doi.org/10.1145/2809812",
ISSN = "2329-4949 (print), 2329-4957 (electronic)",
ISSN-L = "2329-4949",
bibdate = "Tue Nov 3 07:30:42 MST 2015",
bibsource = "http://topc.acm.org/;
http://www.math.utah.edu/pub/tex/bib/topc.bib",
abstract = "We present an algorithm for inversion of symmetric
positive definite matrices that combines the practical
requirement of an optimal number of arithmetic
operations and the theoretical goal of a
polylogarithmic critical path length. The algorithm
reduces inversion to matrix multiplication. It uses
Strassen's recursion scheme, but on the critical path
it breaks the recursion early, switching to an
asymptotically inefficient yet fast use of Newton's
method. We also show that the algorithm is numerically
stable. Overall, we get a candidate for a massively
parallel algorithm that scales to exascale systems even
on relatively small inputs.",
acknowledgement = ack-nhfb,
articleno = "15",
fjournal = "ACM Transactions on Parallel Computing",
journal-URL = "http://dl.acm.org/citation.cfm?id=2632163",
}
@Article{Gilbert:2015:SBO,
author = "Seth Gilbert and Chaodong Zheng",
title = "{SybilCast}: Broadcast on the Open Airwaves",
journal = j-TOPC,
volume = "2",
number = "3",
pages = "16:1--16:??",
month = oct,
year = "2015",
CODEN = "????",
DOI = "https://doi.org/10.1145/2809810",
ISSN = "2329-4949 (print), 2329-4957 (electronic)",
ISSN-L = "2329-4949",
bibdate = "Tue Nov 3 07:30:42 MST 2015",
bibsource = "http://topc.acm.org/;
http://www.math.utah.edu/pub/tex/bib/topc.bib",
abstract = "Consider a scenario where many wireless users are
attempting to download data from a single base station.
While most of the users are honest, some users may be
malicious and attempt to obtain more than their fair
share of the bandwidth. One possible strategy for
attacking the system is to simulate multiple fake
identities, each of which is given its own equal share
of the bandwidth. Such an attack is often referred to
as a sybil attack. To counter such behavior, we propose
SybilCast, a protocol for multichannel wireless
networks that limits the number of fake identities and,
in doing so, ensures that each honest user gets at
least a constant fraction of his or her fair share of
the bandwidth. As a result, each honest user can
complete his or her data download in asymptotically
optimal time. A key aspect of this protocol is
balancing the rate at which new identities are admitted
and the maximum number of fake identities that can
coexist while keeping the overhead low. Besides sybil
attacks, our protocol can also tolerate spoofing and
jamming.",
acknowledgement = ack-nhfb,
articleno = "16",
fjournal = "ACM Transactions on Parallel Computing",
journal-URL = "http://dl.acm.org/citation.cfm?id=2632163",
}
@Article{Lee:2015:FPP,
author = "I-Ting Angelina Lee and Charles E. Leiserson and Tao
B. Schardl and Zhunping Zhang and Jim Sukha",
title = "On-the-Fly Pipeline Parallelism",
journal = j-TOPC,
volume = "2",
number = "3",
pages = "17:1--17:??",
month = oct,
year = "2015",
CODEN = "????",
DOI = "https://doi.org/10.1145/2809808",
ISSN = "2329-4949 (print), 2329-4957 (electronic)",
ISSN-L = "2329-4949",
bibdate = "Tue Nov 3 07:30:42 MST 2015",
bibsource = "http://topc.acm.org/;
http://www.math.utah.edu/pub/tex/bib/topc.bib",
abstract = "Pipeline parallelism organizes a parallel program as a
linear sequence of stages. Each stage processes
elements of a data stream, passing each processed data
element to the next stage, and then taking on a new
element before the subsequent stages have necessarily
completed their processing. Pipeline parallelism is
used especially in streaming applications that perform
video, audio, and digital signal processing. Three out
of 13 benchmarks in PARSEC, a popular software
benchmark suite designed for shared-memory
multiprocessors, can be expressed as pipeline
parallelism. Whereas most concurrency platforms that
support pipeline parallelism use a
``construct-and-run'' approach, this article
investigates ``on-the-fly'' pipeline parallelism, where
the structure of the pipeline emerges as the program
executes rather than being specified a priori.
On-the-fly pipeline parallelism allows the number of
stages to vary from iteration to iteration and
dependencies to be data dependent. We propose simple
linguistics for specifying on-the-fly pipeline
parallelism and describe a provably efficient
scheduling algorithm, the P iper algorithm, which
integrates pipeline parallelism into a work-stealing
scheduler, allowing pipeline and fork-join parallelism
to be arbitrarily nested. The Piper algorithm
automatically throttles the parallelism, precluding
``runaway'' pipelines. Given a pipeline computation
with $ T_1 $ work and $ T_\infty $ span (critical-path
length), Piper executes the computation on $P$
processors in $ T_P \leq T_1 / P + O(T \infty + \lg P)$
expected time. Piper also limits stack space, ensuring
that it does not grow unboundedly with running time. We
have incorporated on-the-fly pipeline parallelism into
a Cilk-based work-stealing runtime system. Our
prototype Cilk-P implementation exploits optimizations
such as ``lazy enabling'' and ``dependency folding.''
We have ported the three PARSEC benchmarks that exhibit
pipeline parallelism to run on Cilk-P. One of these,
x264, cannot readily be executed by systems that
support only construct-and-run pipeline parallelism.
Benchmark results indicate that Cilk-P has low serial
overhead and good scalability. On x264, for example,
Cilk-P exhibits a speedup of 13.87 over its respective
serial counterpart when running on 16 processors.",
acknowledgement = ack-nhfb,
articleno = "17",
fjournal = "ACM Transactions on Parallel Computing",
journal-URL = "http://dl.acm.org/citation.cfm?id=2632163",
}
@Article{Eikel:2015:IRI,
author = "Martina Eikel and Christian Scheideler",
title = "{IRIS}: a Robust Information System Against Insider
{DoS} Attacks",
journal = j-TOPC,
volume = "2",
number = "3",
pages = "18:1--18:??",
month = oct,
year = "2015",
CODEN = "????",
DOI = "https://doi.org/10.1145/2809806",
ISSN = "2329-4949 (print), 2329-4957 (electronic)",
ISSN-L = "2329-4949",
bibdate = "Tue Nov 3 07:30:42 MST 2015",
bibsource = "http://topc.acm.org/;
http://www.math.utah.edu/pub/tex/bib/topc.bib",
abstract = "In this work, we present the first scalable
distributed information system, that is, a system with
low storage overhead, that is provably robust against
denial-of-service (DoS) attacks by a current insider.
We allow a current insider to have complete knowledge
about the information system and to have the power to
block any \varepsilon-fraction of its servers by a DoS
attack, where \varepsilon can be chosen up to a
constant. The task of the system is to serve any
collection of lookup requests with at most one per
nonblocked server in an efficient way despite this
attack. Previously, scalable solutions were only known
for DoS attacks of past insiders, where a past insider
only has complete knowledge about some past time point
$ t_0 $ of the information system. Scheideler et al.
[Awerbuch and Scheideler 2007; Baumgart et al. 2009]
showed that in this case, it is possible to design an
information system so that any information that was
inserted or last updated after $ t_0 $ is safe against
a DoS attack. But their constructions would not work at
all for a current insider. The key idea behind our IRIS
system is to make extensive use of coding. More
precisely, we present two alternative distributed
coding strategies with an at most logarithmic storage
overhead that can handle up to a constant fraction of
blocked servers.",
acknowledgement = ack-nhfb,
articleno = "18",
fjournal = "ACM Transactions on Parallel Computing",
journal-URL = "http://dl.acm.org/citation.cfm?id=2632163",
}
@Article{Kling:2015:PSM,
author = "Peter Kling and Peter Pietrzyk",
title = "Profitable Scheduling on Multiple Speed-Scalable
Processors",
journal = j-TOPC,
volume = "2",
number = "3",
pages = "19:1--19:??",
month = oct,
year = "2015",
CODEN = "????",
DOI = "https://doi.org/10.1145/2809872",
ISSN = "2329-4949 (print), 2329-4957 (electronic)",
ISSN-L = "2329-4949",
bibdate = "Tue Nov 3 07:30:42 MST 2015",
bibsource = "http://topc.acm.org/;
http://www.math.utah.edu/pub/tex/bib/topc.bib",
abstract = "We present a new online algorithm for profit-oriented
scheduling on multiple speed-scalable processors and
provide a tight analysis of the algorithm's
competitiveness. Our results generalize and improve
upon work by Chan et al. [2010], which considers a
single speed-scalable processor. Using significantly
different techniques, we can not only extend their
model to multiprocessors but also prove an enhanced and
tight competitive ratio for our algorithm. In our
scheduling problem, jobs arrive over time and are
preemptable. They have different workloads, values, and
deadlines. The scheduler may decide not to finish a job
but instead to suffer a loss equaling the job's value.
However, to process a job's workload until its deadline
the scheduler must invest a certain amount of energy.
The cost of a schedule is the sum of lost values and
invested energy. In order to finish a job, the
scheduler has to determine which processors to use and
set their speeds accordingly. A processor's energy
consumption is power $ P_\alpha (s) $ integrated over
time, where $ P_\alpha (s) = s^\alpha $ is the power
consumption when running at speed $s$. Since we
consider the online variant of the problem, the
scheduler has no knowledge about future jobs. This
problem was introduced by Chan et al. [2010] for the
case of a single processor. They presented an online
algorithm that is $ \alpha^\alpha + 2 e \alpha
$-competitive. We provide an online algorithm for the
case of multiple processors with an improved
competitive ratio of $ \alpha^\alpha $.",
acknowledgement = ack-nhfb,
articleno = "19",
fjournal = "ACM Transactions on Parallel Computing",
journal-URL = "http://dl.acm.org/citation.cfm?id=2632163",
}
@Article{Dutta:2015:CBR,
author = "Chinmoy Dutta and Gopal Pandurangan and Rajmohan
Rajaraman and Scott Roche",
title = "Coalescing-Branching Random Walks on Graphs",
journal = j-TOPC,
volume = "2",
number = "3",
pages = "20:1--20:??",
month = oct,
year = "2015",
CODEN = "????",
DOI = "https://doi.org/10.1145/2817830",
ISSN = "2329-4949 (print), 2329-4957 (electronic)",
ISSN-L = "2329-4949",
bibdate = "Tue Nov 3 07:30:42 MST 2015",
bibsource = "http://topc.acm.org/;
http://www.math.utah.edu/pub/tex/bib/topc.bib",
abstract = "We study a distributed randomized information
propagation mechanism in networks we call the
coalescing-branching random walk (cobra walk, for
short). A cobra walk is a generalization of the
well-studied ``standard'' random walk, and is useful in
modeling and understanding the Susceptible-Infected-
Susceptible (SIS)-type of epidemic processes in
networks. It can also be helpful in performing
light-weight information dissemination in
resource-constrained networks. A cobra walk is
parameterized by a branching factor $k$. The process
starts from an arbitrary vertex, which is labeled
active for step 1. In each step of a cobra walk, each
active vertex chooses $k$ random neighbors to become
active for the next step (``branching''). A vertex is
active for step $ t + 1$ only if it is chosen by an
active vertex in step $t$ (``coalescing''). This
results in a stochastic process in the underlying
network with properties that are quite different from
both the standard random walk (which is equivalent to
the cobra walk with branching factor 1) as well as
other gossip-based rumor spreading mechanisms. We focus
on the cover time of the cobra walk, which is the
number of steps for the walk to reach all the vertices,
and derive almost-tight bounds for various graph
classes. We show an $ O(\log^2 n)$ high probability
bound for the cover time of cobra walks on expanders,
if either the expansion factor or the branching factor
is sufficiently large; we also obtain an $ O(\log n)$
high probability bound for the partial cover time,
which is the number of steps needed for the walk to
reach at least a constant fraction of the vertices. We
also show that the cover time of the cobra walk is,
with high probability, $ O(n \log n)$ on any $n$-vertex
tree for $ k \geq 2$, $ {\~ O}(n^{1 / d})$ on a
$d$-dimensional grid for $ k \geq 2$, and $ O(\log n)$
on the complete graph.",
acknowledgement = ack-nhfb,
articleno = "20",
fjournal = "ACM Transactions on Parallel Computing",
journal-URL = "http://dl.acm.org/citation.cfm?id=2632163",
}
@Article{Larus:2016:ISI,
author = "James Larus and Sandhya Dwarkadas and Jos{\'e} Moreira
and Andrew Lumsdaine",
title = "Introduction to the Special Issue on {PPoPP'14}",
journal = j-TOPC,
volume = "2",
number = "4",
pages = "21:1--21:??",
month = mar,
year = "2016",
CODEN = "????",
DOI = "https://doi.org/10.1145/2856513",
ISSN = "2329-4949 (print), 2329-4957 (electronic)",
ISSN-L = "2329-4949",
bibdate = "Sat Mar 19 08:11:13 MDT 2016",
bibsource = "http://topc.acm.org/;
http://www.math.utah.edu/pub/tex/bib/topc.bib",
acknowledgement = ack-nhfb,
articleno = "21",
fjournal = "ACM Transactions on Parallel Computing",
journal-URL = "http://dl.acm.org/citation.cfm?id=2632163",
remark = "Special Issue on PPoPP'14 conference.",
}
@Article{Herlihy:2016:WSF,
author = "Maurice Herlihy and Zhiyu Liu",
title = "Well-Structured Futures and Cache Locality",
journal = j-TOPC,
volume = "2",
number = "4",
pages = "22:1--22:??",
month = mar,
year = "2016",
CODEN = "????",
DOI = "https://doi.org/10.1145/2858650",
ISSN = "2329-4949 (print), 2329-4957 (electronic)",
ISSN-L = "2329-4949",
bibdate = "Sat Mar 19 08:11:13 MDT 2016",
bibsource = "http://topc.acm.org/;
http://www.math.utah.edu/pub/tex/bib/topc.bib",
abstract = "In fork-join parallelism, a sequential program is
split into a directed acyclic graph of tasks linked by
directed dependency edges, and the tasks are executed,
possibly in parallel, in an order consistent with their
dependencies. A popular and effective way to extend
fork-join parallelism is to allow threads to create
futures. A thread creates a future to hold the results
of a computation, which may or may not be executed in
parallel. That result is returned when some thread
touches that future, blocking if necessary until the
result is ready. Recent research has shown that
although futures can, of course, enhance parallelism in
a structured way, they can have a deleterious effect on
cache locality. In the worst case, futures can incur $
\Omega (P T_\infty + t T_\infty) $ deviations, which
implies $ \Omega (C P T_\infty + C t T_\infty) $
additional cache misses, where $C$ is the number of
cache lines, $P$ is the number of processors, $t$ is
the number of touches, and $ T_\infty $ is the
computation span. Since cache locality has a large
impact on software performance on modern multicores,
this result is troubling. In this article, we show that
if futures are used in a simple, disciplined way, then
the situation is much better: if each future is touched
only once, either by the thread that created it or by a
later descendant of the thread that created it, then
parallel executions with work stealing can incur at
most $ O(C P T^2_\infty)$ additional cache misses-a
substantial improvement. This structured use of futures
is characteristic of many (but not all) parallel
applications.",
acknowledgement = ack-nhfb,
articleno = "22",
fjournal = "ACM Transactions on Parallel Computing",
journal-URL = "http://dl.acm.org/citation.cfm?id=2632163",
remark = "Special Issue on PPoPP'14 conference.",
}
@Article{Thomson:2016:CTU,
author = "Paul Thomson and Alastair F. Donaldson and Adam
Betts",
title = "Concurrency Testing Using Controlled Schedulers: an
Empirical Study",
journal = j-TOPC,
volume = "2",
number = "4",
pages = "23:1--23:??",
month = mar,
year = "2016",
CODEN = "????",
DOI = "https://doi.org/10.1145/2858651",
ISSN = "2329-4949 (print), 2329-4957 (electronic)",
ISSN-L = "2329-4949",
bibdate = "Sat Mar 19 08:11:13 MDT 2016",
bibsource = "http://topc.acm.org/;
http://www.math.utah.edu/pub/tex/bib/topc.bib",
abstract = "We present an independent empirical study on
concurrency testing using controlled schedulers. We
have gathered 49 buggy concurrent software benchmarks,
drawn from public code bases, which we call SCTBench.
We applied a modified version of an existing
concurrency testing tool to SCTBench, testing five
controlled scheduling techniques: depth-first search,
preemption bounding, delay bounding, a controlled
random scheduler, and probabilistic concurrency testing
(PCT). We attempt to answer several research questions:
Which technique performs the best, in terms of bug
finding ability? How effective are the two main
schedule bounding techniques-preemption bounding and
delay bounding-at finding bugs? What challenges are
associated with applying concurrency testing techniques
to existing code? Can we classify certain benchmarks as
trivial or nontrivial? Overall, we found that PCT (with
parameter d = 3) was the most effective technique in
terms of bug finding; it found all bugs found by the
other techniques, plus an additional three, and it
missed only one bug. Surprisingly, we found that the
naive controlled random scheduler, which randomly
chooses one thread to execute at each scheduling point,
performs well, finding more bugs than preemption
bounding and just two fewer bugs than delay bounding.
Our findings confirm that delay bounding is superior to
preemption bounding and that schedule bounding is
superior to an unbounded depth-first search. The
majority of bugs in SCTBench can be exposed using a
small schedule bound (1--2), supporting previous
claims, although one benchmark requires five
preemptions. We found that the need to remove
nondeterminism and control all synchronization (as is
required for systematic concurrency testing) can be
nontrivial. There were eight distinct programs that
could not easily be included in out study, such as
those that perform network and interprocess
communication. We report various properties about the
benchmarks tested, such as the fact that the bugs in 18
benchmarks were exposed 50\% of the time when using
random scheduling. We note that future work should not
use the benchmarks that we classify as trivial when
presenting new techniques, other than as a minimum
baseline. We have made SCTBench and our tools publicly
available for reproducibility and use in future work.",
acknowledgement = ack-nhfb,
articleno = "23",
fjournal = "ACM Transactions on Parallel Computing",
journal-URL = "http://dl.acm.org/citation.cfm?id=2632163",
remark = "Special Issue on PPoPP'14 conference.",
}
@Article{Petrovic:2016:LHM,
author = "Darko Petrovi{\'c} and Thomas Ropars and Andr{\'e}
Schiper",
title = "Leveraging Hardware Message Passing for Efficient
Thread Synchronization",
journal = j-TOPC,
volume = "2",
number = "4",
pages = "24:1--24:??",
month = mar,
year = "2016",
CODEN = "????",
DOI = "https://doi.org/10.1145/2858652",
ISSN = "2329-4949 (print), 2329-4957 (electronic)",
ISSN-L = "2329-4949",
bibdate = "Sat Mar 19 08:11:13 MDT 2016",
bibsource = "http://topc.acm.org/;
http://www.math.utah.edu/pub/tex/bib/topc.bib",
abstract = "As the level of parallelism in manycore processors
keeps increasing, providing efficient mechanisms for
thread synchronization in concurrent programs is
becoming a major concern. On cache-coherent
shared-memory processors, synchronization efficiency is
ultimately limited by the performance of the underlying
cache coherence protocol. This article studies how
hardware support for message passing can improve
synchronization performance. Considering the ubiquitous
problem of mutual exclusion, we devise novel algorithms
for (i) classic locking, where application threads
obtain exclusive access to a shared resource prior to
executing their critical sections (CSes), and (ii)
delegation, where CSes are executed by special threads.
For classic locking, our HybLock algorithm uses a mix
of shared memory and hardware message passing, which
introduces the idea of hybrid synchronization
algorithms. For delegation, we propose mp-server and
HybComb: the former is a straightforward adaptation of
the server approach to hardware message passing,
whereas the latter is a novel hybrid combining
algorithm. Evaluation on Tilera's TILE-Gx processor
shows that HybLock outperforms the best known classic
locks. Furthermore, mp-server can execute contended
CSes with unprecedented throughput, as stalls related
to cache coherence are removed from the critical path.
HybComb can achieve comparable performance while
avoiding the need to dedicate server cores.
Consequently, our queue and stack implementations,
based on the new synchronization algorithms, largely
outperform their most efficient shared-memory-only
counterparts.",
acknowledgement = ack-nhfb,
articleno = "24",
fjournal = "ACM Transactions on Parallel Computing",
journal-URL = "http://dl.acm.org/citation.cfm?id=2632163",
remark = "Special Issue on PPoPP'14 conference.",
}
@Article{Tardieu:2016:XAP,
author = "Olivier Tardieu and Benjamin Herta and David
Cunningham and David Grove and Prabhanjan Kambadur and
Vijay Saraswat and Avraham Shinnar and Mikio Takeuchi
and Mandana Vaziri and Wei Zhang",
title = "{X10} and {APGAS} at Petascale",
journal = j-TOPC,
volume = "2",
number = "4",
pages = "25:1--25:??",
month = mar,
year = "2016",
CODEN = "????",
DOI = "https://doi.org/10.1145/2894746",
ISSN = "2329-4949 (print), 2329-4957 (electronic)",
ISSN-L = "2329-4949",
bibdate = "Sat Mar 19 08:11:13 MDT 2016",
bibsource = "http://topc.acm.org/;
http://www.math.utah.edu/pub/tex/bib/topc.bib",
abstract = "X10 is a high-performance, high-productivity
programming language aimed at large-scale distributed
and shared-memory parallel applications. It is based on
the Asynchronous Partitioned Global Address Space
(APGAS) programming model, supporting the same
fine-grained concurrency mechanisms within and across
shared-memory nodes. We demonstrate that X10 delivers
solid performance at petascale by running (weak
scaling) eight application kernels on an IBM Power--775
supercomputer utilizing up to 55,680 Power7 cores (for
1.7Pflop/s of theoretical peak performance). For the
four HPC Class 2 Challenge benchmarks, X10 achieves
41\% to 87\% of the system's potential at scale (as
measured by IBM's HPCC Class 1 optimized runs). We also
implement K-Means, Smith-Waterman, Betweenness
Centrality, and Unbalanced Tree Search (UTS) for
geometric trees. Our UTS implementation is the first to
scale to petaflop systems. We describe the advances in
distributed termination detection, distributed load
balancing, and use of high-performance interconnects
that enable X10 to scale out to tens of thousands of
cores. We discuss how this work is driving the
evolution of the X10 language, core class libraries,
and runtime systems.",
acknowledgement = ack-nhfb,
articleno = "25",
fjournal = "ACM Transactions on Parallel Computing",
journal-URL = "http://dl.acm.org/citation.cfm?id=2632163",
remark = "Special Issue on PPoPP'14 conference.",
}
@Article{Maleki:2016:LRM,
author = "Saeed Maleki and Madanlal Musuvathi and Todd
Mytkowicz",
title = "Low-Rank Methods for Parallelizing Dynamic Programming
Algorithms",
journal = j-TOPC,
volume = "2",
number = "4",
pages = "26:1--26:??",
month = mar,
year = "2016",
CODEN = "????",
DOI = "https://doi.org/10.1145/2884065",
ISSN = "2329-4949 (print), 2329-4957 (electronic)",
ISSN-L = "2329-4949",
bibdate = "Sat Mar 19 08:11:13 MDT 2016",
bibsource = "http://topc.acm.org/;
http://www.math.utah.edu/pub/tex/bib/topc.bib",
abstract = "This article proposes efficient parallel methods for
an important class of dynamic programming problems that
includes Viterbi, Needleman--Wunsch, Smith--Waterman,
and Longest Common Subsequence. In dynamic programming,
the subproblems that do not depend on each other, and
thus can be computed in parallel, form stages or
wavefronts. The methods presented in this article
provide additional parallelism allowing multiple stages
to be computed in parallel despite dependencies among
them. The correctness and the performance of the
algorithm relies on rank convergence properties of
matrix multiplication in the tropical semiring, formed
with plus as the multiplicative operation and max as
the additive operation. This article demonstrates the
efficiency of the parallel algorithm by showing
significant speedups on a variety of important dynamic
programming problems. In particular, the parallel
Viterbi decoder is up to $ 24 \times $ faster (with 64
processors) than a highly optimized commercial
baseline.",
acknowledgement = ack-nhfb,
articleno = "26",
fjournal = "ACM Transactions on Parallel Computing",
journal-URL = "http://dl.acm.org/citation.cfm?id=2632163",
remark = "Special Issue on PPoPP'14 conference.",
}
@Article{Yuan:2016:FCN,
author = "Xin Yuan and Wickus Nienaber and Santosh Mahapatra",
title = "On Folded-{Clos} Networks with Deterministic
Single-Path Routing",
journal = j-TOPC,
volume = "2",
number = "4",
pages = "27:1--27:??",
month = mar,
year = "2016",
CODEN = "????",
DOI = "https://doi.org/10.1145/2858654",
ISSN = "2329-4949 (print), 2329-4957 (electronic)",
ISSN-L = "2329-4949",
bibdate = "Sat Mar 19 08:11:13 MDT 2016",
bibsource = "http://topc.acm.org/;
http://www.math.utah.edu/pub/tex/bib/topc.bib",
abstract = "Folded-Clos networks, also known as fat-trees, have
been widely used as interconnects in large-scale
high-performance computing clusters. Although users
often treat such interconnects as replacements of
nonblocking crossbar switches that can carry out any
permutation communication without contention, the
networking capability of such interconnects without a
centralized controller in computer communication
environments is not well understood. In this article,
we investigate nonblocking two-level folded-Clos
networks with deterministic single-path routing, but no
centralized controller, and establish the nonblocking
condition. The results indicate that nonblocking
two-level folded-Clos networks without a centralized
controller are much more expensive to construct than
the traditional nonblocking networks in the
telecommunication environment. Practical two-level
folded-Clos based interconnects are blocking. For such
interconnects, we establish the lower bound for
worst-case contention for permutations with any
deterministic single-path routing scheme, show that
existing routing schemes perform poorly in terms of
worst-case contention for permutations, present a
routing scheme that achieves the theoretical optimal,
and empirically compare the performance of existing
schemes with the optimal routing scheme. The techniques
developed for two-level folded-Clos networks are
further extended for the general fat-trees of any
heights.",
acknowledgement = ack-nhfb,
articleno = "27",
fjournal = "ACM Transactions on Parallel Computing",
journal-URL = "http://dl.acm.org/citation.cfm?id=2632163",
remark = "Special Issue on PPoPP'14 conference.",
}
@Article{Sandes:2016:MMA,
author = "Edans F. De O. Sandes and Guillermo Miranda and Xavier
Martorell and Eduard Ayguade and George Teodoro and
Alba C. M. A. {De Melo}",
title = "{MASA}: a Multiplatform Architecture for Sequence
Aligners with Block Pruning",
journal = j-TOPC,
volume = "2",
number = "4",
pages = "28:1--28:??",
month = mar,
year = "2016",
CODEN = "????",
DOI = "https://doi.org/10.1145/2858656",
ISSN = "2329-4949 (print), 2329-4957 (electronic)",
ISSN-L = "2329-4949",
bibdate = "Sat Mar 19 08:11:13 MDT 2016",
bibsource = "http://topc.acm.org/;
http://www.math.utah.edu/pub/tex/bib/topc.bib",
abstract = "Biological sequence alignment is a very popular
application in Bioinformatics, used routinely
worldwide. Many implementations of biological sequence
alignment algorithms have been proposed for multicores,
GPUs, FPGAs and CellBEs. These implementations are
platform-specific; porting them to other systems
requires considerable programming effort. This article
proposes and evaluates MASA, a flexible and
customizable software architecture that enables the
execution of biological sequence alignment applications
with three variants (local, global, and semiglobal) in
multiple hardware/software platforms with block
pruning, which is able to reduce significantly the
amount of data processed. To attain our flexibility
goals, we also propose a generic version of block
pruning and developed multiple parallelization
strategies as building blocks, including a new
asynchronous dataflow-based parallelization, which may
be combined to implement efficient aligners in
different platforms. We provide four MASA aligner
implementations for multicores (OmpSs and OpenMP), GPU
(CUDA), and Intel Phi (OpenMP), showing that MASA is
very flexible. The evaluation of our generic block
pruning strategy shows that it significantly
outperforms the previously proposed block pruning,
being able to prune up to 66.5\% of the cells when
using the new dataflow-based parallelization
strategy.",
acknowledgement = ack-nhfb,
articleno = "28",
fjournal = "ACM Transactions on Parallel Computing",
journal-URL = "http://dl.acm.org/citation.cfm?id=2632163",
remark = "Special Issue on PPoPP'14 conference.",
}
@Article{MeyeraufderHeide:2016:ISI,
author = "Friedhelm {Meyer auf der Heide} and Peter Sanders and
Nodari Sitchinava",
title = "Introduction to the Special Issue on {SPAA 2014}",
journal = j-TOPC,
volume = "3",
number = "1",
pages = "1:1--1:??",
month = aug,
year = "2016",
CODEN = "????",
DOI = "https://doi.org/10.1145/2936716",
ISSN = "2329-4949 (print), 2329-4957 (electronic)",
ISSN-L = "2329-4949",
bibdate = "Fri Sep 23 15:24:51 MDT 2016",
bibsource = "http://topc.acm.org/;
http://www.math.utah.edu/pub/tex/bib/topc.bib",
acknowledgement = ack-nhfb,
articleno = "1",
fjournal = "ACM Transactions on Parallel Computing",
journal-URL = "http://dl.acm.org/citation.cfm?id=2632163",
}
@Article{Kaler:2016:EDD,
author = "Tim Kaler and William Hasenplaugh and Tao B. Schardl
and Charles E. Leiserson",
title = "Executing Dynamic Data-Graph Computations
Deterministically Using Chromatic Scheduling",
journal = j-TOPC,
volume = "3",
number = "1",
pages = "2:1--2:??",
month = aug,
year = "2016",
CODEN = "????",
DOI = "https://doi.org/10.1145/2896850",
ISSN = "2329-4949 (print), 2329-4957 (electronic)",
ISSN-L = "2329-4949",
bibdate = "Fri Sep 23 15:24:51 MDT 2016",
bibsource = "http://topc.acm.org/;
http://www.math.utah.edu/pub/tex/bib/topc.bib",
abstract = "A data-graph computation-popularized by such
programming systems as Galois, Pregel, GraphLab,
PowerGraph, and GraphChi-is an algorithm that performs
local updates on the vertices of a graph. During each
round of a data-graph computation, an update function
atomically modifies the data associated with a vertex
as a function of the vertex's prior data and that of
adjacent vertices. A dynamic data-graph computation
updates only an active subset of the vertices during a
round, and those updates determine the set of active
vertices for the next round. This article introduces
Prism, a chromatic-scheduling algorithm for executing
dynamic data-graph computations. Prism uses a vertex
coloring of the graph to coordinate updates performed
in a round, precluding the need for mutual-exclusion
locks or other nondeterministic data synchronization. A
multibag data structure is used by Prism to maintain a
dynamic set of active vertices as an unordered set
partitioned by color. We analyze Prism using work-span
analysis. Let $ G = (V, E) $ be a degree-$ \Delta $
graph colored with \chi colors, and suppose that $ Q
\subseteq V $ is the set of active vertices in a round.
Define $ {\rm size} (Q) = | Q | + \Sigma_{v \in Q} {\rm
deg}(v) $, which is proportional to the space required
to store the vertices of $Q$ using a sparse-graph
layout. We show that a $P$-processor execution of Prism
performs updates in $Q$ using $O(\chi (l g (Q / \chi)
+ l g \Delta)) + l g P$ span and $ \Theta (s i z e (Q)
+ P)$ work. These theoretical guarantees are matched by
good empirical performance. To isolate the effect of
the scheduling algorithm on performance, we modified
GraphLab to incorporate Prism and studied seven
application benchmarks on a 12-core multicore machine.
Prism executes the benchmarks 1.2 to 2.1 times faster
than GraphLab's nondeterministic lock-based scheduler
while providing deterministic behavior. This article
also presents Prism-R, a variation of Prism that
executes dynamic data-graph computations
deterministically even when updates modify global
variables with associative operations. Prism-R
satisfies the same theoretical bounds as Prism, but its
implementation is more involved, incorporating a
multivector data structure to maintain a
deterministically ordered set of vertices partitioned
by color. Despite its additional complexity, Prism-R is
only marginally slower than Prism. On the seven
application benchmarks studied, Prism-R incurs a 7\%
geometric mean overhead relative to Prism.",
acknowledgement = ack-nhfb,
articleno = "2",
fjournal = "ACM Transactions on Parallel Computing",
journal-URL = "http://dl.acm.org/citation.cfm?id=2632163",
}
@Article{Im:2016:CST,
author = "Sungjin Im and Benjamin Moseley and Kirk Pruhs and
Eric Torng",
title = "Competitively Scheduling Tasks with Intermediate
Parallelizability",
journal = j-TOPC,
volume = "3",
number = "1",
pages = "4:1--4:??",
month = aug,
year = "2016",
CODEN = "????",
DOI = "https://doi.org/10.1145/2938378",
ISSN = "2329-4949 (print), 2329-4957 (electronic)",
ISSN-L = "2329-4949",
bibdate = "Fri Sep 23 15:24:51 MDT 2016",
bibsource = "http://topc.acm.org/;
http://www.math.utah.edu/pub/tex/bib/topc.bib",
abstract = "We introduce a scheduling algorithm Intermediate-SRPT,
and show that it is $O (\log P)$-competitive with respect
to average flow time when scheduling jobs whose
parallelizability is intermediate between being fully
parallelizable and sequential. Here, the parameter P
denotes the ratio between the maximum job size to the
minimum. We also show a general matching lower bound on
the competitive ratio. Our analysis builds on an
interesting combination of potential function and local
competitiveness arguments.",
acknowledgement = ack-nhfb,
articleno = "4",
fjournal = "ACM Transactions on Parallel Computing",
journal-URL = "http://dl.acm.org/citation.cfm?id=2632163",
}
@Article{Bercea:2016:CMI,
author = "Ioana O. Bercea and Navin Goyal and David G. Harris
and Aravind Srinivasan",
title = "On Computing Maximal Independent Sets of Hypergraphs
in Parallel",
journal = j-TOPC,
volume = "3",
number = "1",
pages = "5:1--5:??",
month = aug,
year = "2016",
CODEN = "????",
DOI = "https://doi.org/10.1145/2938436",
ISSN = "2329-4949 (print), 2329-4957 (electronic)",
ISSN-L = "2329-4949",
bibdate = "Fri Sep 23 15:24:51 MDT 2016",
bibsource = "http://topc.acm.org/;
http://www.math.utah.edu/pub/tex/bib/topc.bib",
abstract = "Whether or not the problem of finding maximal
independent sets (MIS) in hypergraphs is in (R)NC is
one of the fundamental problems in the theory of
parallel computing. Essentially, the challenge is to
design (randomized) algorithms in which the number of
processors used is polynomial and the (expected)
runtime is polylogarithmic in the size of the input.
Unlike the well-understood case of MIS in graphs, for
the hypergraph problem, our knowledge is quite limited
despite considerable work. It is known that the problem
is in RNC when the edges of the hypergraph have
constant size. For general hypergraphs with n vertices
and m edges, the fastest previously known algorithm
works in time $O(\sqrt n) $ with $ \poly (m, n) $
processors. In this article, we give an EREW PRAM
randomized algorithm that works in time $ n^{o (1)} $
with $O(n + m \log n) $ processors on general
hypergraphs satisfying $ m \leq n^{o (1) \log \log n /
\log \log \log n} $. We also give an EREW PRAM
deterministic algorithm that runs in time $ n^\epsilon
$ on a graph with $ m \leq n^{1 / \delta } $ edges, for
any constants $ \delta $, $ \epsilon $; the number of
processors is polynomial in $m$, $n$ for a fixed choice
of $ \delta $, \epsilon . Our algorithms are based on a
sampling idea that reduces the dimension of the
hypergraph and employs the algorithm for constant
dimension hypergraphs as a subroutine.",
acknowledgement = ack-nhfb,
articleno = "5",
fjournal = "ACM Transactions on Parallel Computing",
journal-URL = "http://dl.acm.org/citation.cfm?id=2632163",
}
@Article{Bilo:2016:LBN,
author = "Davide Bil{\`o} and Luciano Gual{\`a} and Stefano
Leucci and Guido Proietti",
title = "Locality-Based Network Creation Games",
journal = j-TOPC,
volume = "3",
number = "1",
pages = "6:1--6:??",
month = aug,
year = "2016",
CODEN = "????",
DOI = "https://doi.org/10.1145/2938426",
ISSN = "2329-4949 (print), 2329-4957 (electronic)",
ISSN-L = "2329-4949",
bibdate = "Fri Sep 23 15:24:51 MDT 2016",
bibsource = "http://topc.acm.org/;
http://www.math.utah.edu/pub/tex/bib/topc.bib",
abstract = "Network creation games have been extensively studied,
both by economists and computer scientists, due to
their versatility in modeling individual-based
community formation processes. These processes, in
turn, are the theoretical counterpart of several
economics, social, and computational applications on
the Internet. In their several variants, these games
model the tension of a player between the player's two
antagonistic goals: to be as close as possible to the
other players and to activate a cheapest possible set
of links. However, the generally adopted assumption is
that players have a common and complete information
about the ongoing network, which is quite unrealistic
in practice. In this article, we consider a more
compelling scenario in which players have only limited
information about the network in which they are
embedded. More precisely, we explore the game-theoretic
and computational implications of assuming that players
have a complete knowledge of the network structure only
up to a given radius k, which is one of the most
qualified local-knowledge models used in distributed
computing. In this respect, we define a suitable
equilibrium concept, and we provide a comprehensive set
of upper and lower bounds to the price of anarchy for
the entire range of values of k and for the two classic
variants of the game, namely, those in which a player's
cost-besides the activation cost of the owned
links-depends on the maximum/sum of all distances to
the other nodes in the network, respectively. These
bounds are assessed through an extensive set of
experiments.",
acknowledgement = ack-nhfb,
articleno = "6",
fjournal = "ACM Transactions on Parallel Computing",
journal-URL = "http://dl.acm.org/citation.cfm?id=2632163",
}
@Article{Jiang:2016:PPA,
author = "Jiayang Jiang and Michael Mitzenmacher and Justin
Thaler",
title = "Parallel Peeling Algorithms",
journal = j-TOPC,
volume = "3",
number = "1",
pages = "7:1--7:??",
month = aug,
year = "2016",
CODEN = "????",
DOI = "https://doi.org/10.1145/2938412",
ISSN = "2329-4949 (print), 2329-4957 (electronic)",
ISSN-L = "2329-4949",
bibdate = "Fri Sep 23 15:24:51 MDT 2016",
bibsource = "http://topc.acm.org/;
http://www.math.utah.edu/pub/tex/bib/topc.bib",
abstract = "The analysis of several algorithms and data structures
can be framed as a peeling process on a random
hypergraph: vertices with degree less than k are
removed until there are no vertices of degree less than
k left. The remaining hypergraph is known as the k
core. In this article, we analyze parallel peeling
processes, in which in each round, all vertices of
degree less than k are removed. It is known that, below
a specific edge-density threshold, the k -core is empty
with high probability. We show that, with high
probability, below this threshold, only $1 / \log((k -
1) (r - 1)) \log \log n + O (1)$ rounds of peeling are
needed to obtain the empty $k$-core for $4br$-uniform
hypergraphs. This bound is tight up to an additive
constant. Interestingly, we show that, above this
threshold, $\Omega (\log n)$ rounds of peeling are
required to find the nonempty $k$-core. Since most
algorithms and data structures aim to peel to an empty
$k$-core, this asymmetry appears fortunate. We verify
the theoretical results both with simulation and with a
parallel implementation using graphics processing units
(GPUs). Our implementation provides insights into how
to structure parallel peeling algorithms for efficiency
in practice.",
acknowledgement = ack-nhfb,
articleno = "7",
fjournal = "ACM Transactions on Parallel Computing",
journal-URL = "http://dl.acm.org/citation.cfm?id=2632163",
}
@Article{Simhadri:2016:EAS,
author = "Harsha Vardhan Simhadri and Guy E. Blelloch and Jeremy
T. Fineman and Phillip B. Gibbons and Aapo Kyrola",
title = "Experimental Analysis of Space-Bounded Schedulers",
journal = j-TOPC,
volume = "3",
number = "1",
pages = "8:1--8:??",
month = aug,
year = "2016",
CODEN = "????",
DOI = "https://doi.org/10.1145/2938389",
ISSN = "2329-4949 (print), 2329-4957 (electronic)",
ISSN-L = "2329-4949",
bibdate = "Fri Sep 23 15:24:51 MDT 2016",
bibsource = "http://topc.acm.org/;
http://www.math.utah.edu/pub/tex/bib/topc.bib",
abstract = "The running time of nested parallel programs on
shared-memory machines depends in significant part on
how well the scheduler mapping the program to the
machine is optimized for the organization of caches and
processor cores on the machine. Recent work proposed
``space-bounded schedulers'' for scheduling such
programs on the multilevel cache hierarchies of current
machines. The main benefit of this class of schedulers
is that they provably preserve locality of the program
at every level in the hierarchy, which can result in
fewer cache misses and better use of bandwidth than the
popular work-stealing scheduler. On the other hand,
compared to work stealing, space-bounded schedulers are
inferior at load balancing and may have greater
scheduling overheads, raising the question as to the
relative effectiveness of the two schedulers in
practice. In this article, we provide the first
experimental study aimed at addressing this question.
To facilitate this study, we built a flexible
experimental framework with separate interfaces for
programs and schedulers. This enables a head-to-head
comparison of the relative strengths of schedulers in
terms of running times and cache miss counts across a
range of benchmarks. (The framework is validated by
comparisons with the Intel{\reg} CilkTM Plus
work-stealing scheduler.) We present experimental
results on a 32-core Xeon{\reg} 7560 comparing work
stealing, hierarchy-minded work stealing, and two
variants of space-bounded schedulers on both
divide-and-conquer microbenchmarks and some popular
algorithmic kernels. Our results indicate that
space-bounded schedulers reduce the number of L3 cache
misses compared to work-stealing schedulers by 25\% to
65\% for most of the benchmarks, but incur up to 27\%
additional scheduler and load-imbalance overhead. Only
for memory-intensive benchmarks can the reduction in
cache misses overcome the added overhead, resulting in
up to a 25\% improvement in running time for synthetic
benchmarks and about 20\% improvement for algorithmic
kernels. We also quantify runtime improvements varying
the available bandwidth per core (the ``bandwidth
gap'') and show up to 50\% improvements in the running
times of kernels as this gap increases fourfold. As
part of our study, we generalize prior definitions of
space-bounded schedulers to allow for more practical
variants (while still preserving their guarantees) and
explore implementation tradeoffs.",
acknowledgement = ack-nhfb,
articleno = "8",
fjournal = "ACM Transactions on Parallel Computing",
journal-URL = "http://dl.acm.org/citation.cfm?id=2632163",
}
@Article{Sheikh:2016:SHJ,
author = "Hafiz Fahad Sheikh and Ishfaq Ahmad",
title = "Sixteen Heuristics for Joint Optimization of
Performance, Energy, and Temperature in Allocating
Tasks to Multi-Cores",
journal = j-TOPC,
volume = "3",
number = "2",
pages = "9:1--9:??",
month = aug,
year = "2016",
CODEN = "????",
DOI = "https://doi.org/10.1145/2948973",
ISSN = "2329-4949 (print), 2329-4957 (electronic)",
ISSN-L = "2329-4949",
bibdate = "Fri Sep 23 15:24:52 MDT 2016",
bibsource = "http://topc.acm.org/;
http://www.math.utah.edu/pub/tex/bib/topc.bib",
abstract = "Three-way joint optimization of performance ( P ),
energy ( E ), and temperature ( T ) in scheduling
parallel tasks to multiple cores poses a challenge that
is staggering in its computational complexity. The goal
of the PET optimized scheduling ( PETOS ) problem is to
minimize three quantities: the completion time of a
task graph, the total energy consumption, and the peak
temperature of the system. Algorithms based on
conventional multi-objective optimization techniques
can be designed for solving the PETOS problem. But
their execution times are exceedingly high and hence
their applicability is restricted merely to problems of
modest size. Exacerbating the problem is the solution
space that is typically a Pareto front since no single
solution can be strictly best along all three
objectives. Thus, not only is the absolute quality of
the solutions important but ``the spread of the
solutions'' along each objective and the distribution
of solutions within the generated tradeoff front are
also desired. A natural alternative is to design
efficient heuristic algorithms that can generate good
solutions as well as good spreads --- note that most of
the prior work in energy-efficient task allocation is
predominantly single- or dual-objective oriented. Given
a directed acyclic graph (DAG) representing a parallel
program, a heuristic encompasses policies as to what
tasks should go to what cores and at what frequency
should that core operate. Various policies, such as
greedy, iterative, and probabilistic, can be employed.
However, the choice and usage of these policies can
influence a heuristic towards a particular objective
and can also profoundly impact its performance. This
article proposes 16 heuristics that utilize various
methods for task-to-core allocation and frequency
selection. This article also presents a methodical
classification scheme which not only categorizes the
proposed heuristics but can also accommodate additional
heuristics. Extensive simulation experiments compare
these algorithms while shedding light on their
strengths and tradeoffs.",
acknowledgement = ack-nhfb,
articleno = "9",
fjournal = "ACM Transactions on Parallel Computing",
journal-URL = "http://dl.acm.org/citation.cfm?id=2632163",
}
@Article{Blanchard:2016:SMO,
author = "Jeffrey D. Blanchard and Erik Opavsky and Emircan
Uysaler",
title = "Selecting Multiple Order Statistics with a Graphics
Processing Unit",
journal = j-TOPC,
volume = "3",
number = "2",
pages = "10:1--10:??",
month = aug,
year = "2016",
CODEN = "????",
DOI = "https://doi.org/10.1145/2948974",
ISSN = "2329-4949 (print), 2329-4957 (electronic)",
ISSN-L = "2329-4949",
bibdate = "Fri Sep 23 15:24:52 MDT 2016",
bibsource = "http://topc.acm.org/;
http://www.math.utah.edu/pub/tex/bib/topc.bib",
abstract = "Extracting a set of multiple order statistics from a
huge data set provides important information about the
distribution of the values in the full set of data.
This article introduces an algorithm,
bucketMultiSelect, for simultaneously selecting
multiple order statistics with a graphics processing
unit (GPU). Typically, when a large set of order
statistics is desired, the vector is sorted. When the
sorted version of the vector is not needed,
bucketMultiSelect significantly reduces computation
time by eliminating a large portion of the unnecessary
operations involved in sorting. For large vectors,
bucketMultiSelect returns thousands of order statistics
in less time than sorting the vector while typically
using less memory. For vectors containing $2^{28}$
values of type double, bucketMultiSelect selects the
101 percentile order statistics in less than 95ms and
is more than $8 \times $ faster than sorting the vector
with a GPU optimized merge sort.",
acknowledgement = ack-nhfb,
articleno = "10",
fjournal = "ACM Transactions on Parallel Computing",
journal-URL = "http://dl.acm.org/citation.cfm?id=2632163",
}
@Article{Bohme:2016:IRC,
author = "David B{\"o}hme and Markus Geimer and Lukas Arnold and
Felix Voigtlaender and Felix Wolf",
title = "Identifying the Root Causes of Wait States in
Large-Scale Parallel Applications",
journal = j-TOPC,
volume = "3",
number = "2",
pages = "11:1--11:??",
month = aug,
year = "2016",
CODEN = "????",
DOI = "https://doi.org/10.1145/2934661",
ISSN = "2329-4949 (print), 2329-4957 (electronic)",
ISSN-L = "2329-4949",
bibdate = "Fri Sep 23 15:24:52 MDT 2016",
bibsource = "http://topc.acm.org/;
http://www.math.utah.edu/pub/tex/bib/topc.bib",
abstract = "Driven by growing application requirements and
accelerated by current trends in microprocessor design,
the number of processor cores on modern supercomputers
is increasing from generation to generation. However,
load or communication imbalance prevents many codes
from taking advantage of the available parallelism, as
delays of single processes may spread wait states
across the entire machine. Moreover, when employing
complex point-to-point communication patterns, wait
states may propagate along far-reaching cause-effect
chains that are hard to track manually and that
complicate an assessment of the actual costs of an
imbalance. Building on earlier work by Meira, Jr., et
al., we present a scalable approach that identifies
program wait states and attributes their costs in terms
of resource waste to their original cause. By replaying
event traces in parallel both forward and backward, we
can identify the processes and call paths responsible
for the most severe imbalances, even for runs with
hundreds of thousands of processes.",
acknowledgement = ack-nhfb,
articleno = "11",
fjournal = "ACM Transactions on Parallel Computing",
journal-URL = "http://dl.acm.org/citation.cfm?id=2632163",
}
@Article{Dathathri:2016:CAL,
author = "Roshan Dathathri and Ravi Teja Mullapudi and Uday
Bondhugula",
title = "Compiling Affine Loop Nests for a Dynamic Scheduling
Runtime on Shared and Distributed Memory",
journal = j-TOPC,
volume = "3",
number = "2",
pages = "12:1--12:??",
month = aug,
year = "2016",
CODEN = "????",
DOI = "https://doi.org/10.1145/2948975",
ISSN = "2329-4949 (print), 2329-4957 (electronic)",
ISSN-L = "2329-4949",
bibdate = "Fri Sep 23 15:24:52 MDT 2016",
bibsource = "http://topc.acm.org/;
http://www.math.utah.edu/pub/tex/bib/topc.bib",
abstract = "Current de-facto parallel programming models like
OpenMP and MPI make it difficult to extract task-level
dataflow parallelism as opposed to bulk-synchronous
parallelism. Task parallel approaches that use
point-to-point synchronization between dependent tasks
in conjunction with dynamic scheduling dataflow
runtimes are thus becoming attractive. Although good
performance can be extracted for both shared and
distributed memory using these approaches, there is
little compiler support for them. In this article, we
describe the design of compiler--runtime interaction to
automatically extract coarse-grained dataflow
parallelism in affine loop nests for both shared and
distributed-memory architectures. We use techniques
from the polyhedral compiler framework to extract tasks
and generate components of the runtime that are used to
dynamically schedule the generated tasks. The runtime
includes a distributed decentralized scheduler that
dynamically schedules tasks on a node. The schedulers
on different nodes cooperate with each other through
asynchronous point-to-point communication, and all of
this is achieved by code automatically generated by the
compiler. On a set of six representative affine loop
nest benchmarks, while running on 32 nodes with 8
threads each, our compiler-assisted runtime yields a
geometric mean speedup of $ 143.6 \times $ ($ 70.3
\times $ to $ 474.7 \times $) over the sequential
version and a geometric mean speedup of $ 1.64 \times $
($ 1.04 \times $ to $ 2.42 \times $) over the
state-of-the-art automatic parallelization approach
that uses bulk synchronization. We also compare our
system with past work that addresses some of these
challenges on shared memory, and an emerging runtime
(Intel Concurrent Collections) that demands higher
programmer input and effort in parallelizing. To the
best of our knowledge, ours is also the first automatic
scheme that allows for dynamic scheduling of affine
loop nests on a cluster of multicores.",
acknowledgement = ack-nhfb,
articleno = "12",
fjournal = "ACM Transactions on Parallel Computing",
journal-URL = "http://dl.acm.org/citation.cfm?id=2632163",
}
@Article{Benoit:2016:AGP,
author = "Anne Benoit and Aur{\'e}lien Cavelan and Yves Robert
and Hongyang Sun",
title = "Assessing General-Purpose Algorithms to Cope with
Fail-Stop and Silent Errors",
journal = j-TOPC,
volume = "3",
number = "2",
pages = "13:1--13:??",
month = aug,
year = "2016",
CODEN = "????",
DOI = "https://doi.org/10.1145/2897189",
ISSN = "2329-4949 (print), 2329-4957 (electronic)",
ISSN-L = "2329-4949",
bibdate = "Fri Sep 23 15:24:52 MDT 2016",
bibsource = "http://topc.acm.org/;
http://www.math.utah.edu/pub/tex/bib/topc.bib",
abstract = "In this article, we combine the traditional
checkpointing and rollback recovery strategies with
verification mechanisms to cope with both fail-stop and
silent errors. The objective is to minimize makespan
and/or energy consumption. For divisible load
applications, we use first-order approximations to find
the optimal checkpointing period to minimize execution
time, with an additional verification mechanism to
detect silent errors before each checkpoint, hence
extending the classical formula by Young and Daly for
fail-stop errors only. We further extend the approach
to include intermediate verifications, and to consider
a bicriteria problem involving both time and energy
(linear combination of execution time and energy
consumption). Then, we focus on application workflows
whose dependence graph is a linear chain of tasks.
Here, we determine the optimal checkpointing and
verification locations, with or without intermediate
verifications, for the bicriteria problem. Rather than
using a single speed during the whole execution, we
further introduce a new execution scenario, which
allows for changing the execution speed via Dynamic
Voltage and Frequency Scaling (DVFS). In this latter
scenario, we determine the optimal checkpointing and
verification locations, as well as the optimal speed
pairs for each task segment between any two consecutive
checkpoints. Finally, we conduct an extensive set of
simulations to support the theoretical study, and to
assess the performance of each algorithm, showing that
the best overall performance is achieved under the most
flexible scenario using intermediate verifications and
different speeds.",
acknowledgement = ack-nhfb,
articleno = "13",
fjournal = "ACM Transactions on Parallel Computing",
journal-URL = "http://dl.acm.org/citation.cfm?id=2632163",
}
@Article{Koutis:2016:SPD,
author = "Ioannis Koutis and Shen Chen Xu",
title = "Simple Parallel and Distributed Algorithms for
Spectral Graph Sparsification",
journal = j-TOPC,
volume = "3",
number = "2",
pages = "14:1--14:??",
month = aug,
year = "2016",
CODEN = "????",
DOI = "https://doi.org/10.1145/2948062",
ISSN = "2329-4949 (print), 2329-4957 (electronic)",
ISSN-L = "2329-4949",
bibdate = "Fri Sep 23 15:24:52 MDT 2016",
bibsource = "http://topc.acm.org/;
http://www.math.utah.edu/pub/tex/bib/topc.bib",
abstract = "We describe simple algorithms for spectral graph
sparsification, based on iterative computations of
weighted spanners and sampling. Leveraging the
algorithms of Baswana and Sen for computing spanners,
we obtain the first distributed spectral sparsification
algorithm in the CONGEST model. We also obtain a
parallel algorithm with improved work and time
guarantees, as well as other natural distributed
implementations. Combining this algorithm with the
parallel framework of Peng and Spielman for solving
symmetric diagonally dominant linear systems, we get a
parallel solver that is significantly more efficient in
terms of the total work.",
acknowledgement = ack-nhfb,
articleno = "14",
fjournal = "ACM Transactions on Parallel Computing",
journal-URL = "http://dl.acm.org/citation.cfm?id=2632163",
}
@Article{Dorier:2016:DAP,
author = "Matthieu Dorier and Gabriel Antoniu and Franck
Cappello and Marc Snir and Robert Sisneros and Orcun
Yildiz and Shadi Ibrahim and Tom Peterka and Leigh
Orf",
title = "{Damaris}: Addressing Performance Variability in Data
Management for Post-Petascale Simulations",
journal = j-TOPC,
volume = "3",
number = "3",
pages = "15:1--15:??",
month = dec,
year = "2016",
CODEN = "????",
DOI = "https://doi.org/10.1145/2987371",
ISSN = "2329-4949 (print), 2329-4957 (electronic)",
ISSN-L = "2329-4949",
bibdate = "Mon Dec 26 17:40:41 MST 2016",
bibsource = "http://topc.acm.org/;
http://www.math.utah.edu/pub/tex/bib/topc.bib",
abstract = "With exascale computing on the horizon, reducing
performance variability in data management tasks
(storage, visualization, analysis, etc.) is becoming a
key challenge in sustaining high performance. This
variability significantly impacts the overall
application performance at scale and its predictability
over time. In this article, we present Damaris, a
system that leverages dedicated cores in multicore
nodes to offload data management tasks, including I/O,
data compression, scheduling of data movements, in situ
analysis, and visualization. We evaluate Damaris with
the CM1 atmospheric simulation and the Nek5000
computational fluid dynamic simulation on four
platforms, including NICS's Kraken and NCSA's Blue
Waters. Our results show that (1) Damaris fully hides
the I/O variability as well as all I/O-related costs,
thus making simulation performance predictable; (2) it
increases the sustained write throughput by a factor of
up to 15 compared with standard I/O approaches; (3) it
allows almost perfect scalability of the simulation up
to over 9,000 cores, as opposed to state-of-the-art
approaches that fail to scale; and (4) it enables a
seamless connection to the VisIt visualization software
to perform in situ analysis and visualization in a way
that impacts neither the performance of the simulation
nor its variability. In addition, we extended our
implementation of Damaris to also support the use of
dedicated nodes and conducted a thorough comparison of
the two approaches-dedicated cores and dedicated
nodes-for I/O tasks with the aforementioned
applications.",
acknowledgement = ack-nhfb,
articleno = "15",
fjournal = "ACM Transactions on Parallel Computing",
journal-URL = "http://dl.acm.org/citation.cfm?id=2632163",
}
@Article{Gao:2016:AOM,
author = "Jiaquan Gao and Yu Wang and Jun Wang and Ronghua
Liang",
title = "Adaptive Optimization Modeling of Preconditioned
Conjugate Gradient on Multi-{GPUs}",
journal = j-TOPC,
volume = "3",
number = "3",
pages = "16:1--16:??",
month = dec,
year = "2016",
CODEN = "????",
DOI = "https://doi.org/10.1145/2990849",
ISSN = "2329-4949 (print), 2329-4957 (electronic)",
ISSN-L = "2329-4949",
bibdate = "Mon Dec 26 17:40:41 MST 2016",
bibsource = "http://topc.acm.org/;
http://www.math.utah.edu/pub/tex/bib/topc.bib",
abstract = "The preconditioned conjugate gradient (PCG) algorithm
is a well-known iterative method for solving sparse
linear systems in scientific computations.
GPU-accelerated PCG algorithms for large-sized problems
have attracted considerable attention recently.
However, on a specific multi-GPU platform, producing a
highly parallel PCG implementation for any large-sized
problem requires significant time because several
manual steps are involved in adjusting the related
parameters and selecting an appropriate storage format
for the matrix block that is assigned to each GPU. This
motivates us to propose adaptive optimization modeling
of PCG on multi-GPUs, which mainly involves the
following parts: (1) an optimization multi-GPU parallel
framework of PCG and (2) the profile-based optimization
modeling for each one of the main components of the PCG
algorithm, including vector operation, inner product,
and sparse matrix-vector multiplication (SpMV). Our
model does not construct a new storage format or kernel
but automatically and rapidly generates an optimal
parallel PCG algorithm for any problem on a specific
multi-GPU platform by integrating existing storage
formats and kernels. We take a vector operation kernel,
an inner-product kernel, and five popular SpMV kernels
for an example to present the idea of constructing the
model. Given that our model is general, independent of
the problems, and dependent on the resources of
devices, this model is constructed only once for each
type of GPU. The experiments validate the high
efficiency of our proposed model.",
acknowledgement = ack-nhfb,
articleno = "16",
fjournal = "ACM Transactions on Parallel Computing",
journal-URL = "http://dl.acm.org/citation.cfm?id=2632163",
}
@Article{Creech:2016:TSS,
author = "Timothy Creech and Rajeev Barua",
title = "Transparently Space Sharing a Multicore Among Multiple
Processes",
journal = j-TOPC,
volume = "3",
number = "3",
pages = "17:1--17:??",
month = dec,
year = "2016",
CODEN = "????",
DOI = "https://doi.org/10.1145/3001910",
ISSN = "2329-4949 (print), 2329-4957 (electronic)",
ISSN-L = "2329-4949",
bibdate = "Mon Dec 26 17:40:41 MST 2016",
bibsource = "http://topc.acm.org/;
http://www.math.utah.edu/pub/tex/bib/topc.bib",
abstract = "As hardware becomes increasingly parallel and the
availability of scalable parallel software improves,
the problem of managing multiple multithreaded
applications (processes) becomes important. Malleable
processes, which can vary the number of threads used as
they run, enable sophisticated and flexible resource
management. Although many existing applications
parallelized for SMPs with parallel runtimes are in
fact already malleable, deployed runtime environments
provide no interface nor any strategy for intelligently
allocating hardware threads or even preventing
oversubscription. Prior research methods either depend
on profiling applications ahead of time to make good
decisions about allocations or do not account for
process efficiency at all, leading to poor performance.
None of these prior methods have been adapted widely in
practice. This article presents the Scheduling and
Allocation with Feedback (SCAF) system: a drop-in
runtime solution that supports existing malleable
applications in making intelligent allocation decisions
based on observed efficiency without any changes to
semantics, program modification, offline profiling, or
even recompilation. Our existing implementation can
control most unmodified OpenMP applications. Other
malleable threading libraries can also easily be
supported with small modifications without requiring
application modification or recompilation. In this
work, we present the SCAF daemon and a SCAF-aware port
of the GNU OpenMP runtime. We present a new technique
for estimating process efficiency purely at runtime
using available hardware counters and demonstrate its
effectiveness in aiding allocation decisions. We
evaluated SCAF using NAS NPB parallel benchmarks on
five commodity parallel platforms, enumerating
architectural features and their effects on our scheme.
We measured the benefit of SCAF in terms of sum of
speedups improvement (a common metric for
multiprogrammed environments) when running all
benchmark pairs concurrently compared to
equipartitioning-the best existing competing scheme in
the literature. We found that SCAF improves on
equipartitioning on four out of five machines, showing
a mean improvement factor in sum of speedups of 1.04 to
1.11x for benchmark pairs, depending on the machine,
and 1.09x on average. Since we are not aware of any
widely available tool for equipartitioning, we also
compare SCAF against multiprogramming using unmodified
OpenMP, which is the only environment available to end
users today. SCAF improves on the unmodified OpenMP
runtimes for all five machines, with a mean improvement
of 1.08 to 2.07x, depending on the machine, and 1.59x
on average.",
acknowledgement = ack-nhfb,
articleno = "17",
fjournal = "ACM Transactions on Parallel Computing",
journal-URL = "http://dl.acm.org/citation.cfm?id=2632163",
}
@Article{Ballard:2016:HPS,
author = "Grey Ballard and Alex Druinsky and Nicholas Knight and
Oded Schwartz",
title = "Hypergraph Partitioning for Sparse Matrix--Matrix
Multiplication",
journal = j-TOPC,
volume = "3",
number = "3",
pages = "18:1--18:??",
month = dec,
year = "2016",
CODEN = "????",
DOI = "https://doi.org/10.1145/3015144",
ISSN = "2329-4949 (print), 2329-4957 (electronic)",
ISSN-L = "2329-4949",
bibdate = "Mon Dec 26 17:40:41 MST 2016",
bibsource = "http://topc.acm.org/;
http://www.math.utah.edu/pub/tex/bib/topc.bib",
abstract = "We propose a fine-grained hypergraph model for sparse
matrix-matrix multiplication (SpGEMM), a key
computational kernel in scientific computing and data
analysis whose performance is often communication
bound. This model correctly describes both the
interprocessor communication volume along a critical
path in a parallel computation and also the volume of
data moving through the memory hierarchy in a
sequential computation. We show that identifying a
communication-optimal algorithm for particular input
matrices is equivalent to solving a hypergraph
partitioning problem. Our approach is nonzero structure
dependent, meaning that we seek the best algorithm for
the given input matrices. In addition to our
three-dimensional fine-grained model, we also propose
coarse-grained one-dimensional and two-dimensional
models that correspond to simpler SpGEMM algorithms. We
explore the relations between our models theoretically,
and we study their performance experimentally in the
context of three applications that use SpGEMM as a key
computation. For each application, we find that at
least one coarse-grained model is as communication
efficient as the fine-grained model. We also observe
that different applications have affinities for
different algorithms. Our results demonstrate that
hypergraphs are an accurate model for reasoning about
the communication costs of SpGEMM as well as a
practical tool for exploring the SpGEMM algorithm
design space.",
acknowledgement = ack-nhfb,
articleno = "18",
fjournal = "ACM Transactions on Parallel Computing",
journal-URL = "http://dl.acm.org/citation.cfm?id=2632163",
}
@Article{Grove:2017:ISS,
author = "David Grove",
title = "Introduction to the Special Section on {PPoPP'15}",
journal = j-TOPC,
volume = "3",
number = "4",
pages = "19:1--19:??",
month = mar,
year = "2017",
CODEN = "????",
DOI = "https://doi.org/10.1145/3040224",
ISSN = "2329-4949 (print), 2329-4957 (electronic)",
ISSN-L = "2329-4949",
bibdate = "Sat Mar 25 07:55:06 MDT 2017",
bibsource = "http://topc.acm.org/;
http://www.math.utah.edu/pub/tex/bib/topc.bib",
acknowledgement = ack-nhfb,
articleno = "19",
fjournal = "ACM Transactions on Parallel Computing",
journal-URL = "http://dl.acm.org/citation.cfm?id=2632163",
}
@Article{Majo:2017:LPC,
author = "Zoltan Majo and Thomas R. Gross",
title = "A Library for Portable and Composable Data Locality
Optimizations for {NUMA} Systems",
journal = j-TOPC,
volume = "3",
number = "4",
pages = "20:1--20:??",
month = mar,
year = "2017",
CODEN = "????",
DOI = "https://doi.org/10.1145/3040222",
ISSN = "2329-4949 (print), 2329-4957 (electronic)",
ISSN-L = "2329-4949",
bibdate = "Sat Mar 25 07:55:06 MDT 2017",
bibsource = "http://topc.acm.org/;
http://www.math.utah.edu/pub/tex/bib/topc.bib",
abstract = "Many recent multiprocessor systems are realized with a
nonuniform memory architecture (NUMA) and accesses to
remote memory locations take more time than local
memory accesses. Optimizing NUMA memory system
performance is difficult and costly for three principal
reasons: (1) Today's programming languages/libraries
have no explicit support for NUMA systems, (2) NUMA
optimizations are not portable, and (3) optimizations
are not composable (i.e., they can become ineffective
or worsen performance in environments that support
composable parallel software). This article presents
TBB-NUMA, a parallel programming library based on Intel
Threading Building Blocks (TBB) that supports portable
and composable NUMA-aware programming. TBB-NUMA
provides a model of task affinity that captures a
programmer's insights on mapping tasks to resources.
NUMA-awareness affects all layers of the library (i.e.,
resource management, task scheduling, and high-level
parallel algorithm templates) and requires close
coupling between all these layers. Optimizations
implemented with TBB-NUMA (for a set of standard
benchmark programs) result in up to 44\% performance
improvement over standard TBB. But more important,
optimized programs are portable across different NUMA
architectures and preserve data locality also when
composed with other parallel computations sharing the
same resource management layer.",
acknowledgement = ack-nhfb,
articleno = "20",
fjournal = "ACM Transactions on Parallel Computing",
journal-URL = "http://dl.acm.org/citation.cfm?id=2632163",
}
@Article{Golan-Gueta:2017:ASA,
author = "Guy Golan-Gueta and G. Ramalingam and Mooly Sagiv and
Eran Yahav",
title = "Automatic Scalable Atomicity via Semantic Locking",
journal = j-TOPC,
volume = "3",
number = "4",
pages = "21:1--21:??",
month = mar,
year = "2017",
CODEN = "????",
DOI = "https://doi.org/10.1145/3040223",
ISSN = "2329-4949 (print), 2329-4957 (electronic)",
ISSN-L = "2329-4949",
bibdate = "Sat Mar 25 07:55:06 MDT 2017",
bibsource = "http://topc.acm.org/;
http://www.math.utah.edu/pub/tex/bib/topc.bib",
abstract = "In this article, we consider concurrent programs in
which the shared state consists of instances of
linearizable abstract data types (ADTs). We present an
automated approach to concurrency control that
addresses a common need: the need to atomically execute
a code fragment, which may contain multiple ADT
operations on multiple ADT instances. We present a
synthesis algorithm that automatically enforces
atomicity of given code fragments (in a client program)
by inserting pessimistic synchronization that
guarantees atomicity and deadlock-freedom (without
using any rollback mechanism). Our algorithm takes a
commutativity specification as an extra input. This
specification indicates for every pair of ADT
operations the conditions under which the operations
commute. Our algorithm enables greater parallelism by
permitting commuting operations to execute
concurrently. We have implemented the synthesis
algorithm in a Java compiler and applied it to several
Java programs. Our results show that our approach
produces efficient and scalable synchronization.",
acknowledgement = ack-nhfb,
articleno = "21",
fjournal = "ACM Transactions on Parallel Computing",
journal-URL = "http://dl.acm.org/citation.cfm?id=2632163",
}
@Article{Izraelevitz:2017:GSN,
author = "Joseph Izraelevitz and Michael L. Scott",
title = "Generality and Speed in Nonblocking Dual Containers",
journal = j-TOPC,
volume = "3",
number = "4",
pages = "22:1--22:??",
month = mar,
year = "2017",
CODEN = "????",
DOI = "https://doi.org/10.1145/3040220",
ISSN = "2329-4949 (print), 2329-4957 (electronic)",
ISSN-L = "2329-4949",
bibdate = "Sat Mar 25 07:55:06 MDT 2017",
bibsource = "http://topc.acm.org/;
http://www.math.utah.edu/pub/tex/bib/topc.bib",
abstract = "Nonblocking dual data structures extend traditional
notions of nonblocking progress to accommodate partial
methods, both by bounding the number of steps that a
thread can execute after its preconditions have been
satisfied and by ensuring that a waiting thread
performs no remote memory accesses that could interfere
with the execution of other threads. A nonblocking dual
container, in particular, is designed to hold either
data or requests. An insert operation either adds data
to the container or removes and satisfies a request; a
remove operation either takes data out of the container
or inserts a request. We present the first
general-purpose construction for nonblocking dual
containers, allowing any nonblocking container for data
to be paired with almost any nonblocking container for
requests. We also present new custom algorithms, based
on the LCRQ of Morrison and Afek, that outperform the
fastest previously known dual containers by factors of
four to six.",
acknowledgement = ack-nhfb,
articleno = "22",
fjournal = "ACM Transactions on Parallel Computing",
journal-URL = "http://dl.acm.org/citation.cfm?id=2632163",
}
@Article{Cole:2017:ROS,
author = "Richard Cole and Vijaya Ramachandran",
title = "Resource Oblivious Sorting on Multicores",
journal = j-TOPC,
volume = "3",
number = "4",
pages = "23:1--23:??",
month = mar,
year = "2017",
CODEN = "????",
DOI = "https://doi.org/10.1145/3040221",
ISSN = "2329-4949 (print), 2329-4957 (electronic)",
ISSN-L = "2329-4949",
bibdate = "Sat Mar 25 07:55:06 MDT 2017",
bibsource = "http://topc.acm.org/;
http://www.math.utah.edu/pub/tex/bib/topc.bib",
abstract = "We present a deterministic sorting algorithm, Sample,
Partition, and Merge Sort (SPMS), that interleaves the
partitioning of a sample sort with merging.
Sequentially, it sorts $n$ elements in $ O(n \log n)$
time cache-obliviously with an optimal number of cache
misses. The parallel complexity (or critical path
length) of the algorithm is $ O(\log n \log \log n)$,
which improves on previous bounds for deterministic
sample sort. The algorithm also has low false sharing
costs. When scheduled by a work-stealing scheduler in a
multicore computing environment with a global shared
memory and p cores, each having a cache of size $M$
organized in blocks of size $B$, the costs of the
additional cache misses and false sharing misses due to
this parallel execution are bounded by the cost of $
O(S \cdot M / B)$ and $ O(S \cdot B)$ cache misses,
respectively, where $S$ is the number of steals
performed during the execution. Finally, SPMS is
resource oblivious in that the dependence on machine
parameters appear only in the analysis of its
performance and not within the algorithm itself.",
acknowledgement = ack-nhfb,
articleno = "23",
fjournal = "ACM Transactions on Parallel Computing",
journal-URL = "http://dl.acm.org/citation.cfm?id=2632163",
}
@Article{Ballard:2017:GEIa,
author = "Grey Ballard and Mary Hall and Tim Harris and Brandon
Lucia",
title = "{Guest Editor} Introduction {PPoPP 2016}, Special
Issue 2 of 2",
journal = j-TOPC,
volume = "4",
number = "1",
pages = "1:1--1:??",
month = oct,
year = "2017",
CODEN = "????",
DOI = "https://doi.org/10.1145/3108141",
ISSN = "2329-4949 (print), 2329-4957 (electronic)",
ISSN-L = "2329-4949",
bibdate = "Tue Oct 10 17:42:07 MDT 2017",
bibsource = "http://topc.acm.org/;
http://www.math.utah.edu/pub/tex/bib/topc.bib",
acknowledgement = ack-nhfb,
articleno = "1",
fjournal = "ACM Transactions on Parallel Computing",
journal-URL = "http://dl.acm.org/citation.cfm?id=2632163",
}
@Article{Ashkiani:2017:GME,
author = "Saman Ashkiani and Andrew Davidson and Ulrich Meyer
and John D. Owens",
title = "{GPU Multisplit}: an Extended Study of a Parallel
Algorithm",
journal = j-TOPC,
volume = "4",
number = "1",
pages = "2:1--2:??",
month = oct,
year = "2017",
CODEN = "????",
DOI = "https://doi.org/10.1145/3108139",
ISSN = "2329-4949 (print), 2329-4957 (electronic)",
ISSN-L = "2329-4949",
bibdate = "Tue Oct 10 17:42:07 MDT 2017",
bibsource = "http://topc.acm.org/;
http://www.math.utah.edu/pub/tex/bib/topc.bib",
abstract = "Multisplit is a broadly useful parallel primitive that
permutes its input data into contiguous buckets or
bins, where the function that categorizes an element
into a bucket is provided by the programmer. Due to the
lack of an efficient multisplit on Graphics Processing
Units (GPUs), programmers often choose to implement
multisplit with a sort. One way is to first generate an
auxiliary array of bucket IDs and then sort input data
based on it. In case smaller indexed buckets possess
smaller valued keys, another way for multisplit is to
directly sort input data. Both methods are inefficient
and require more work than necessary: The former
requires more expensive data movements while the latter
spends unnecessary effort in sorting elements within
each bucket. In this work, we provide a parallel model
and multiple implementations for the multisplit
problem. Our principal focus is multisplit for a small
(up to 256) number of buckets. We use warp-synchronous
programming models and emphasize warpwide
communications to avoid branch divergence and reduce
memory usage. We also hierarchically reorder input
elements to achieve better coalescing of global memory
accesses. On a GeForce GTX 1080 GPU, we can reach a
peak throughput of 18.93Gkeys/s (or 11.68Gpairs/s) for
a key-only (or key-value) multisplit. Finally, we
demonstrate how multisplit can be used as a building
block for radix sort. In our multisplit-based sort
implementation, we achieve comparable performance to
the fastest GPU sort routines, sorting 32-bit keys (and
key-value pairs) with a throughput of 3.0Gkeys/s (and
2.1Gpair/s).",
acknowledgement = ack-nhfb,
articleno = "2",
fjournal = "ACM Transactions on Parallel Computing",
journal-URL = "http://dl.acm.org/citation.cfm?id=2632163",
}
@Article{Wang:2017:GGG,
author = "Yangzihao Wang and Yuechao Pan and Andrew Davidson and
Yuduo Wu and Carl Yang and Leyuan Wang and Muhammad
Osama and Chenshan Yuan and Weitang Liu and Andy T.
Riffel and John D. Owens",
title = "{Gunrock}: {GPU} Graph Analytics",
journal = j-TOPC,
volume = "4",
number = "1",
pages = "3:1--3:??",
month = oct,
year = "2017",
CODEN = "????",
DOI = "https://doi.org/10.1145/3108140",
ISSN = "2329-4949 (print), 2329-4957 (electronic)",
ISSN-L = "2329-4949",
bibdate = "Tue Oct 10 17:42:07 MDT 2017",
bibsource = "http://topc.acm.org/;
http://www.math.utah.edu/pub/tex/bib/topc.bib",
abstract = "For large-scale graph analytics on the GPU, the
irregularity of data access and control flow, and the
complexity of programming GPUs, have presented two
significant challenges to developing a programmable
high-performance graph library. ``Gunrock,'' our
graph-processing system designed specifically for the
GPU, uses a high-level, bulk-synchronous, data-centric
abstraction focused on operations on a vertex or edge
frontier. Gunrock achieves a balance between
performance and expressiveness by coupling
high-performance GPU computing primitives and
optimization strategies with a high-level programming
model that allows programmers to quickly develop new
graph primitives with small code size and minimal GPU
programming knowledge. We characterize the performance
of various optimization strategies and evaluate
Gunrock's overall performance on different GPU
architectures on a wide range of graph primitives that
span from traversal-based algorithms and ranking
algorithms, to triangle counting and
bipartite-graph-based algorithms. The results show that
on a single GPU, Gunrock has on average at least an
order of magnitude speedup over Boost and PowerGraph,
comparable performance to the fastest GPU hardwired
primitives and CPU shared-memory graph libraries, such
as Ligra and Galois, and better performance than any
other GPU high-level graph library.",
acknowledgement = ack-nhfb,
articleno = "3",
fjournal = "ACM Transactions on Parallel Computing",
journal-URL = "http://dl.acm.org/citation.cfm?id=2632163",
}
@Article{Chowdhury:2017:AAD,
author = "Rezaul Chowdhury and Pramod Ganapathi and Stephen
Tschudi and Jesmin Jahan Tithi and Charles Bachmeier
and Charles E. Leiserson and Armando Solar-Lezama and
Bradley C. Kuszmaul and Yuan Tang",
title = "{Autogen}: Automatic Discovery of Efficient Recursive
Divide-\&-Conquer Algorithms for Solving Dynamic
Programming Problems",
journal = j-TOPC,
volume = "4",
number = "1",
pages = "4:1--4:??",
month = oct,
year = "2017",
CODEN = "????",
DOI = "https://doi.org/10.1145/3125632",
ISSN = "2329-4949 (print), 2329-4957 (electronic)",
ISSN-L = "2329-4949",
bibdate = "Tue Oct 10 17:42:07 MDT 2017",
bibsource = "http://topc.acm.org/;
http://www.math.utah.edu/pub/tex/bib/topc.bib",
abstract = "We present Autogen --- an algorithm that for a wide
class of dynamic programming (DP) problems
automatically discovers highly efficient
cache-oblivious parallel recursive divide-and-conquer
algorithms from inefficient iterative descriptions of
DP recurrences. Autogen analyzes the set of DP table
locations accessed by the iterative algorithm when run
on a DP table of small size and automatically
identifies a recursive access pattern and a
corresponding provably correct recursive algorithm for
solving the DP recurrence. We use Autogen to
autodiscover efficient algorithms for several
well-known problems. Our experimental results show that
several autodiscovered algorithms significantly
outperform parallel looping and tiled loop-based
algorithms. Also, these algorithms are less sensitive
to fluctuations of memory and bandwidth compared with
their looping counterparts, and their running times and
energy profiles remain relatively more stable. To the
best of our knowledge, Autogen is the first algorithm
that can automatically discover new nontrivial
divide-and-conquer algorithms.",
acknowledgement = ack-nhfb,
articleno = "4",
fjournal = "ACM Transactions on Parallel Computing",
journal-URL = "http://dl.acm.org/citation.cfm?id=2632163",
}
@Article{Steele:2017:AAC,
author = "Guy L. {Steele Jr.} and Jean-Baptiste Tristan",
title = "Adding Approximate Counters",
journal = j-TOPC,
volume = "4",
number = "1",
pages = "5:1--5:??",
month = oct,
year = "2017",
CODEN = "????",
DOI = "https://doi.org/10.1145/3132167",
ISSN = "2329-4949 (print), 2329-4957 (electronic)",
ISSN-L = "2329-4949",
bibdate = "Tue Oct 10 17:42:07 MDT 2017",
bibsource = "http://topc.acm.org/;
http://www.math.utah.edu/pub/tex/bib/topc.bib",
abstract = "We describe a general framework for adding the values
of two approximate counters to produce a new
approximate counter value whose expected estimated
value is equal to the sum of the expected estimated
values of the given approximate counters. (To the best
of our knowledge, this is the first published
description of any algorithm for adding two approximate
counters.) We then work out implementation details for
five different kinds of approximate counter and provide
optimized pseudocode. For three of them, we present
proofs that the variance of a counter value produced by
adding two counter values in this way is bounded, and
in fact is no worse, or not much worse, than the
variance of the value of a single counter to which the
same total number of increment operations have been
applied. Addition of approximate counters is useful in
massively parallel divide-and-conquer algorithms that
use a distributed representation for large arrays of
counters. We describe two machine-learning algorithms
for topic modeling that use millions of integer
counters and confirm that replacing the integer
counters with approximate counters is effective,
speeding up a GPU-based implementation by over 65\% and
a CPU-based implementation by nearly 50\%, as well as
reducing memory requirements, without degrading their
statistical effectiveness.",
acknowledgement = ack-nhfb,
articleno = "5",
fjournal = "ACM Transactions on Parallel Computing",
journal-URL = "http://dl.acm.org/citation.cfm?id=2632163",
}
@Article{Ballard:2017:GEIb,
author = "Grey Ballard and Mary Hall and Tim Harris and Brandon
Lucia",
title = "{Guest Editor} Introduction {PPoPP 2016}, Special
Issue 2 of 2",
journal = j-TOPC,
volume = "4",
number = "2",
pages = "6:1--6:??",
month = oct,
year = "2017",
CODEN = "????",
DOI = "https://doi.org/10.1145/3108142",
ISSN = "2329-4949 (print), 2329-4957 (electronic)",
ISSN-L = "2329-4949",
bibdate = "Tue Oct 10 17:42:07 MDT 2017",
bibsource = "http://topc.acm.org/;
http://www.math.utah.edu/pub/tex/bib/topc.bib",
acknowledgement = ack-nhfb,
articleno = "6",
fjournal = "ACM Transactions on Parallel Computing",
journal-URL = "http://dl.acm.org/citation.cfm?id=2632163",
}
@Article{Kalikar:2017:DNM,
author = "Saurabh Kalikar and Rupesh Nasre",
title = "{DomLock}: a New Multi-Granularity Locking Technique
for Hierarchies",
journal = j-TOPC,
volume = "4",
number = "2",
pages = "7:1--7:??",
month = oct,
year = "2017",
CODEN = "????",
DOI = "https://doi.org/10.1145/3127584",
ISSN = "2329-4949 (print), 2329-4957 (electronic)",
ISSN-L = "2329-4949",
bibdate = "Tue Oct 10 17:42:07 MDT 2017",
bibsource = "http://topc.acm.org/;
http://www.math.utah.edu/pub/tex/bib/topc.bib",
abstract = "We present efficient locking mechanisms for
hierarchical data structures. Several applications work
on an abstract hierarchy of objects, and a parallel
execution on this hierarchy necessitates
synchronization across workers operating on different
parts of the hierarchy. Existing synchronization
mechanisms are too coarse, too inefficient, or too ad
hoc, resulting in reduced or unpredictable amount of
concurrency. We propose a new locking approach based on
the structural properties of the underlying hierarchy.
We show that the developed techniques are efficient
even when the hierarchy is an arbitrary graph.
Theoretically, we present our approach as a
locking-cost-minimizing instance of a generic algebraic
model of synchronization for hierarchies. Using
STMBench7, we illustrate considerable reduction in the
locking cost, resulting in an average throughput
improvement of 42\%.",
acknowledgement = ack-nhfb,
articleno = "7",
fjournal = "ACM Transactions on Parallel Computing",
journal-URL = "http://dl.acm.org/citation.cfm?id=2632163",
}
@Article{Haider:2017:LRA,
author = "Syed Kamran Haider and William Hasenplaugh and Dan
Alistarh",
title = "{Lease\slash Release}: Architectural Support for
Scaling Contended Data Structures",
journal = j-TOPC,
volume = "4",
number = "2",
pages = "8:1--8:??",
month = oct,
year = "2017",
CODEN = "????",
DOI = "https://doi.org/10.1145/3132168",
ISSN = "2329-4949 (print), 2329-4957 (electronic)",
ISSN-L = "2329-4949",
bibdate = "Tue Oct 10 17:42:07 MDT 2017",
bibsource = "http://topc.acm.org/;
http://www.math.utah.edu/pub/tex/bib/topc.bib",
abstract = "High memory contention is generally agreed to be a
worst-case scenario for concurrent data structures.
There has been a significant amount of research effort
spent investigating designs that minimize contention,
and several programming techniques have been proposed
to mitigate its effects. However, there are currently
few architectural mechanisms to allow scaling contended
data structures at high thread counts. In this article,
we investigate hardware support for scalable contended
data structures. We propose Lease/Release, a simple
addition to standard directory-based MESI cache
coherence protocols, allowing participants to lease
memory, at the granularity of cache lines, by delaying
coherence messages for a short, bounded period of time.
Our analysis shows that Lease/Release can significantly
reduce the overheads of contention for both
non-blocking (lock-free) and lock-based data structure
implementations while ensuring that no deadlocks are
introduced. We validate Lease/Release empirically on
the Graphite multiprocessor simulator on a range of
data structures, including queue, stack, and priority
queue implementations, as well as on transactional
applications. Results show that Lease/Release
consistently improves both throughput and energy usage,
by up to 5x, both for lock-free and lock-based data
structure designs.",
acknowledgement = ack-nhfb,
articleno = "8",
fjournal = "ACM Transactions on Parallel Computing",
journal-URL = "http://dl.acm.org/citation.cfm?id=2632163",
}
@Article{Cao:2017:HRD,
author = "Man Cao and Minjia Zhang and Aritra Sengupta and
Swarnendu Biswas and Michael D. Bond",
title = "Hybridizing and Relaxing Dependence Tracking for
Efficient Parallel Runtime Support",
journal = j-TOPC,
volume = "4",
number = "2",
pages = "9:1--9:??",
month = oct,
year = "2017",
CODEN = "????",
DOI = "https://doi.org/10.1145/3108138",
ISSN = "2329-4949 (print), 2329-4957 (electronic)",
ISSN-L = "2329-4949",
bibdate = "Tue Oct 10 17:42:07 MDT 2017",
bibsource = "http://topc.acm.org/;
http://www.math.utah.edu/pub/tex/bib/topc.bib",
abstract = "It is notoriously challenging to develop parallel
software systems that are both scalable and correct.
Runtime support for parallelism-such as multithreaded
record and replay, data race detectors, transactional
memory, and enforcement of stronger memory models-helps
achieve these goals, but existing commodity solutions
slow programs substantially to track (i.e., detect or
control) an execution's cross-thread dependencies
accurately. Prior work tracks cross-thread dependencies
either ``pessimistically,'' slowing every program
access, or ``optimistically,'' allowing for lightweight
instrumentation of most accesses but dramatically
slowing accesses that are conflicting (i.e., involved
in cross-thread dependencies). This article presents
two novel approaches that seek to improve the
performance of dependence tracking. Hybrid tracking
(HT) hybridizes pessimistic and optimistic tracking by
overcoming a fundamental mismatch between these two
kinds of tracking. HT uses an adaptive, profile-based
policy to make runtime decisions about switching
between pessimistic and optimistic tracking. Relaxed
tracking (RT) attempts to reduce optimistic tracking's
overhead on conflicting accesses by tracking
dependencies in a ``relaxed'' way-meaning that not all
dependencies are tracked accurately-while still
preserving both program semantics and runtime support's
correctness. To demonstrate the usefulness and
potential of HT and RT, we build runtime support based
on the two approaches. Our evaluation shows that both
approaches offer performance advantages over existing
approaches, but there exist challenges and
opportunities for further improvement. HT and RT are
distinct solutions to the same problem. It is easier to
build runtime support based on HT than on RT, although
RT does not incur the overhead of online profiling.
This article presents the two approaches together to
inform and inspire future designs for efficient
parallel runtime support.",
acknowledgement = ack-nhfb,
articleno = "9",
fjournal = "ACM Transactions on Parallel Computing",
journal-URL = "http://dl.acm.org/citation.cfm?id=2632163",
}
@Article{Chatzopoulos:2017:EES,
author = "Georgios Chatzopoulos and Aleksandar Dragojevi{\'c}
and Rachid Guerraoui",
title = "{ESTIMA}: Extrapolating {ScalabiliTy} of In-Memory
Applications",
journal = j-TOPC,
volume = "4",
number = "2",
pages = "10:1--10:??",
month = oct,
year = "2017",
CODEN = "????",
DOI = "https://doi.org/10.1145/3108137",
ISSN = "2329-4949 (print), 2329-4957 (electronic)",
ISSN-L = "2329-4949",
bibdate = "Tue Oct 10 17:42:07 MDT 2017",
bibsource = "http://topc.acm.org/;
http://www.math.utah.edu/pub/tex/bib/topc.bib",
abstract = "This article presents estima, an easy-to-use tool for
extrapolating the scalability of in-memory
applications. estima is designed to perform a simple
yet important task: Given the performance of an
application on a small machine with a handful of cores,
estima extrapolates its scalability to a larger machine
with more cores, while requiring minimum input from the
user. The key idea underlying estima is the use of
stalled cycles (e.g., cycles that the processor spends
waiting for missed cache line fetches or busy locks).
estima measures stalled cycles on a few cores and
extrapolates them to more cores, estimating the amount
of waiting in the system. estima can be effectively
used to predict the scalability of in-memory
applications for bigger execution machines. For
instance, using measurements of memcached and SQLite on
a desktop machine, we obtain accurate predictions of
their scalability on a server. Our extensive evaluation
shows the effectiveness of estima on a large number of
in-memory benchmarks.",
acknowledgement = ack-nhfb,
articleno = "10",
fjournal = "ACM Transactions on Parallel Computing",
journal-URL = "http://dl.acm.org/citation.cfm?id=2632163",
}
@Article{Gulisano:2017:EDS,
author = "Vincenzo Gulisano and Yiannis Nikolakopoulos and
Daniel Cederman and Marina Papatriantafilou and
Philippas Tsigas",
title = "Efficient Data Streaming Multiway Aggregation through
Concurrent Algorithmic Designs and New Abstract Data
Types",
journal = j-TOPC,
volume = "4",
number = "2",
pages = "11:1--11:??",
month = oct,
year = "2017",
CODEN = "????",
DOI = "https://doi.org/10.1145/3131272",
ISSN = "2329-4949 (print), 2329-4957 (electronic)",
ISSN-L = "2329-4949",
bibdate = "Tue Oct 10 17:42:07 MDT 2017",
bibsource = "http://topc.acm.org/;
http://www.math.utah.edu/pub/tex/bib/topc.bib",
abstract = "Data streaming relies on continuous queries to process
unbounded streams of data in a real-time fashion. It is
commonly demanding in computation capacity, given that
the relevant applications involve very large volumes of
data. Data structures act as articulation points and
maintain the state of data streaming operators,
potentially supporting high parallelism and balancing
the work among them. Prompted by this fact, in this
work we study and analyze parallelization needs of
these articulation points, focusing on the problem of
streaming multiway aggregation, where large data
volumes are received from multiple input streams. The
analysis of the parallelization needs, as well as of
the use and limitations of existing aggregate designs
and their data structures, leads us to identify needs
for appropriate shared objects that can achieve
low-latency and high-throughput multiway aggregation.
We present the requirements of such objects as abstract
data types and we provide efficient lock-free
linearizable algorithmic implementations of them, along
with new multiway aggregate algorithmic designs that
leverage them, supporting both deterministic
order-sensitive and order-insensitive aggregate
functions. Furthermore, we point out future directions
that open through these contributions. The article
includes an extensive experimental study, based on a
variety of continuous aggregation queries on two large
datasets extracted from SoundCloud, a music social
network, and from a Smart Grid network. In all the
experiments, the proposed data structures and the
enhanced aggregate operators improved the processing
performance significantly, up to one order of
magnitude, in terms of both throughput and latency,
over the commonly used techniques based on queues.",
acknowledgement = ack-nhfb,
articleno = "11",
fjournal = "ACM Transactions on Parallel Computing",
journal-URL = "http://dl.acm.org/citation.cfm?id=2632163",
}
@Article{Malas:2018:MIP,
author = "Tareq M. Malas and Georg Hager and Hatem Ltaief and
David E. Keyes",
title = "Multidimensional Intratile Parallelization for
Memory-Starved Stencil Computations",
journal = j-TOPC,
volume = "4",
number = "3",
pages = "12:1--12:??",
month = apr,
year = "2018",
CODEN = "????",
DOI = "https://doi.org/10.1145/3155290",
ISSN = "2329-4949 (print), 2329-4957 (electronic)",
ISSN-L = "2329-4949",
bibdate = "Wed Jan 23 16:12:25 MST 2019",
bibsource = "http://www.math.utah.edu/pub/tex/bib/topc.bib",
abstract = "Optimizing the performance of stencil algorithms has
been the subject of intense research over the last two
decades. Since many stencil schemes have low arithmetic
intensity, most optimizations focus on increasing the
temporal data access locality, thus reducing the data
traffic through the main memory interface with the
ultimate goal of decoupling from this bottleneck. There
are, however, only a few approaches that explicitly
leverage the shared cache feature of modern multicore
chips. If every thread works on its private, separate
cache block, the available cache space can become too
small, and sufficient temporal locality may not be
achieved. We propose a flexible multidimensional
intratile parallelization method for stencil algorithms
on multicore CPUs with a shared outer-level cache. This
method leads to a significant reduction in the required
cache space without adverse effects from hardware
prefetching or TLB shortage. Our Girih framework
includes an autotuner to select optimal parameter
configurations on the target hardware. We conduct
performance experiments on two contemporary Intel
processors and compare with the state-of-the-art
stencil frameworks Pluto and Pochoir, using four
corner-case stencil schemes and a wide range of problem
sizes. Girih shows substantial performance advantages
and best arithmetic intensity at almost all problem
sizes, especially on low-intensity stencils with
variable coefficients. We study in detail the
performance behavior at varying grid sizes using
phenomenological performance modeling. Our analysis of
energy consumption reveals that our method can save
energy through reduced DRAM bandwidth usage even at a
marginal performance gain. It is thus well suited for
future architectures that will be strongly challenged
by the cost of data movement, be it in terms of
performance or energy consumption.",
acknowledgement = ack-nhfb,
articleno = "12",
fjournal = "ACM Transactions on Parallel Computing",
journal-URL = "http://dl.acm.org/citation.cfm?id=2632163",
}
@Article{Akbudak:2018:PMS,
author = "Kadir Akbudak and Oguz Selvitopi and Cevdet Aykanat",
title = "Partitioning Models for Scaling Parallel Sparse
Matrix--Matrix Multiplication",
journal = j-TOPC,
volume = "4",
number = "3",
pages = "13:1--13:??",
month = apr,
year = "2018",
CODEN = "????",
DOI = "https://doi.org/10.1145/3155292",
ISSN = "2329-4949 (print), 2329-4957 (electronic)",
ISSN-L = "2329-4949",
bibdate = "Wed Jan 23 16:12:25 MST 2019",
bibsource = "http://www.math.utah.edu/pub/tex/bib/topc.bib",
abstract = "We investigate outer-product--parallel,
inner-product--parallel, and
row-by-row-product--parallel formulations of sparse
matrix-matrix multiplication (SpGEMM) on distributed
memory architectures. For each of these three
formulations, we propose a hypergraph model and a
bipartite graph model for distributing SpGEMM
computations based on one-dimensional (1D) partitioning
of input matrices. We also propose a communication
hypergraph model for each formulation for distributing
communication operations. The computational graph and
hypergraph models adopted in the first phase aim at
minimizing the total message volume and balancing the
computational loads of processors, whereas the
communication hypergraph models adopted in the second
phase aim at minimizing the total message count and
balancing the message volume loads of processors. That
is, the computational partitioning models reduce the
bandwidth cost and the communication hypergraph models
reduce the latency cost. Our extensive parallel
experiments on up to 2048 processors for a wide range
of realistic SpGEMM instances show that although the
outer-product--parallel formulation scales better, the
row-by-row-product--parallel formulation is more viable
due to its significantly lower partitioning overhead
and competitive scalability. For computational
partitioning models, our experimental findings indicate
that the proposed bipartite graph models are attractive
alternatives to their hypergraph counterparts because
of their lower partitioning overhead. Finally, we show
that by reducing the latency cost besides the bandwidth
cost through using the communication hypergraph models,
the parallel SpGEMM time can be further improved up to
32\%.",
acknowledgement = ack-nhfb,
articleno = "13",
fjournal = "ACM Transactions on Parallel Computing",
journal-URL = "http://dl.acm.org/citation.cfm?id=2632163",
}
@Article{Bilardi:2018:LBT,
author = "Gianfranco Bilardi and Michele Scquizzato and
Francesco Silvestri",
title = "A Lower Bound Technique for Communication in {BSP}",
journal = j-TOPC,
volume = "4",
number = "3",
pages = "14:1--14:??",
month = apr,
year = "2018",
CODEN = "????",
DOI = "https://doi.org/10.1145/3181776",
ISSN = "2329-4949 (print), 2329-4957 (electronic)",
ISSN-L = "2329-4949",
bibdate = "Wed Jan 23 16:12:25 MST 2019",
bibsource = "http://www.math.utah.edu/pub/tex/bib/topc.bib",
abstract = "Communication is a major factor determining the
performance of algorithms on current computing systems;
it is therefore valuable to provide tight lower bounds
on the communication complexity of computations. This
article presents a lower bound technique for the
communication complexity in the bulk-synchronous
parallel (BSP) model of a given class of DAG
computations. The derived bound is expressed in terms
of the switching potential of a DAG, that is, the
number of permutations that the DAG can realize when
viewed as a switching network. The proposed technique
yields tight lower bounds for the fast Fourier
transform (FFT), and for any sorting and permutation
network. A stronger bound is also derived for the
periodic balanced sorting network, by applying this
technique to suitable subnetworks. Finally, we
demonstrate that the switching potential captures
communication requirements even in computational models
different from BSP, such as the I/O model and the
LPRAM.",
acknowledgement = ack-nhfb,
articleno = "14",
fjournal = "ACM Transactions on Parallel Computing",
journal-URL = "http://dl.acm.org/citation.cfm?id=2632163",
}
@Article{Sahin:2018:CSC,
author = "Semih Sahin and Bugra Gedik",
title = "{C-Stream}: a Co-routine-Based Elastic Stream
Processing Engine",
journal = j-TOPC,
volume = "4",
number = "3",
pages = "15:1--15:??",
month = apr,
year = "2018",
CODEN = "????",
DOI = "https://doi.org/10.1145/3184120",
ISSN = "2329-4949 (print), 2329-4957 (electronic)",
ISSN-L = "2329-4949",
bibdate = "Wed Jan 23 16:12:25 MST 2019",
bibsource = "http://www.math.utah.edu/pub/tex/bib/multithreading.bib;
http://www.math.utah.edu/pub/tex/bib/topc.bib",
abstract = "Stream processing is a computational paradigm for
on-the-fly processing of live data. This paradigm lends
itself to implementations that can provide high
throughput and low latency by taking advantage of
various forms of parallelism that are naturally
captured by the stream processing model of computation,
such as pipeline, task, and data parallelism. In this
article, we describe the design and implementation of
C-Stream, which is an elastic stream processing engine.
C-Stream encompasses three unique properties. First, in
contrast to the widely adopted event-based interface
for developing streaming operators, C-Stream provides
an interface wherein each operator has its own driver
loop and relies on data availability application
programming interfaces (APIs) to decide when to perform
its computations. This self-control-based model
significantly simplifies the development of operators
that require multiport synchronization. Second,
C-Stream contains a dynamic scheduler that manages the
multithreaded execution of the operators. The
scheduler, which is customizable via plug-ins, enables
the execution of the operators as co-routines, using
any number of threads. The base scheduler implements
back-pressure, provides data availability APIs, and
manages preemption and termination handling. Last,
C-Stream varies the degree of parallelism to resolve
bottlenecks by both dynamically changing the number of
threads used to execute an application and adjusting
the number of replicas of data-parallel operators. We
provide an experimental evaluation of C-Stream. The
results show that C-Stream is scalable, highly
customizable, and can resolve bottlenecks by
dynamically adjusting the level of data parallelism
used.",
acknowledgement = ack-nhfb,
articleno = "15",
fjournal = "ACM Transactions on Parallel Computing",
journal-URL = "http://dl.acm.org/citation.cfm?id=2632163",
}
@Article{Agrawal:2018:ISI,
author = "Kunal Agrawal and I-Ting Angelina Lee and Michael
Spear",
title = "Introduction to Special Issue on {SPAA'15}",
journal = j-TOPC,
volume = "4",
number = "4",
pages = "16:1--16:??",
month = sep,
year = "2018",
CODEN = "????",
DOI = "https://doi.org/10.1145/3226041",
ISSN = "2329-4949 (print), 2329-4957 (electronic)",
ISSN-L = "2329-4949",
bibdate = "Wed Jan 23 16:12:25 MST 2019",
bibsource = "http://www.math.utah.edu/pub/tex/bib/topc.bib",
acknowledgement = ack-nhfb,
articleno = "16",
fjournal = "ACM Transactions on Parallel Computing",
journal-URL = "http://dl.acm.org/citation.cfm?id=2632163",
}
@Article{Ahn:2018:ADN,
author = "Kook Jin Ahn and Sudipto Guha",
title = "Access to Data and Number of Iterations: Dual Primal
Algorithms for Maximum Matching under Resource
Constraints",
journal = j-TOPC,
volume = "4",
number = "4",
pages = "17:1--17:??",
month = sep,
year = "2018",
CODEN = "????",
DOI = "https://doi.org/10.1145/3154855",
ISSN = "2329-4949 (print), 2329-4957 (electronic)",
ISSN-L = "2329-4949",
bibdate = "Wed Jan 23 16:12:25 MST 2019",
bibsource = "http://www.math.utah.edu/pub/tex/bib/topc.bib",
abstract = "In this article, we consider graph algorithms in
models of computation where the space usage (random
accessible storage, in addition to the read-only input)
is sublinear in the number of edges $m$ and the access
to input is constrained. These questions arise in many
natural settings, and in particular in the analysis of
streaming algorithms, MapReduce or similar algorithms,
or message passing distributed computing that model
constrained parallelism with sublinear central
processing. We focus on weighted nonbipartite maximum
matching in this article. For any constant $ p > 1$, we
provide an iterative sampling-based algorithm for
computing a $ (1 - \epsilon)$-approximation of the
weighted nonbipartite maximum matching that uses $ O(p
/ \epsilon)$ rounds of sampling, and $ O(n^{1 + 1 /
p})$ space. The results extend to $b$-Matching with
small changes. This article combines adaptive sketching
literature and fast primal-dual algorithms based on
relaxed Dantzig--Wolfe decision procedures. Each round
of sampling is implemented through linear sketches and
can be executed in a single round of streaming or two
rounds of MapReduce. The article also proves that
nonstandard linear relaxations of a problem, in
particular penalty-based formulations, are helpful in
reducing the adaptive dependence of the iterations.",
acknowledgement = ack-nhfb,
articleno = "17",
fjournal = "ACM Transactions on Parallel Computing",
journal-URL = "http://dl.acm.org/citation.cfm?id=2632163",
}
@Article{Alistarh:2018:TAS,
author = "Dan Alistarh and William Leiserson and Alexander
Matveev and Nir Shavit",
title = "{ThreadScan}: Automatic and Scalable Memory
Reclamation",
journal = j-TOPC,
volume = "4",
number = "4",
pages = "18:1--18:??",
month = sep,
year = "2018",
CODEN = "????",
DOI = "https://doi.org/10.1145/3201897",
ISSN = "2329-4949 (print), 2329-4957 (electronic)",
ISSN-L = "2329-4949",
bibdate = "Wed Jan 23 16:12:25 MST 2019",
bibsource = "http://www.math.utah.edu/pub/tex/bib/topc.bib",
abstract = "The concurrent memory reclamation problem is that of
devising a way for a deallocating thread to verify that
no other concurrent threads hold references to a memory
block being deallocated. To date, in the absence of
automatic garbage collection, there is no satisfactory
solution to this problem; existing tracking methods
like hazard pointers, reference counters, or
epoch-based techniques like RCU are either
prohibitively expensive or require significant
programming expertise to the extent that implementing
them efficiently can be worthy of a publication. None
of the existing techniques are automatic or even
semi-automated. In this article, we take a new approach
to concurrent memory reclamation. Instead of manually
tracking access to memory locations as done in
techniques like hazard pointers, or restricting shared
accesses to specific epoch boundaries as in RCU, our
algorithm, called ThreadScan, leverages operating
system signaling to automatically detect which memory
locations are being accessed by concurrent threads.
Initial empirical evidence shows that ThreadScan scales
surprisingly well and requires negligible programming
effort beyond the standard use of Malloc and Free.",
acknowledgement = ack-nhfb,
articleno = "18",
fjournal = "ACM Transactions on Parallel Computing",
journal-URL = "http://dl.acm.org/citation.cfm?id=2632163",
}
@Article{Dimitrov:2018:RDT,
author = "Dimitar Dimitrov and Martin Vechev and Vivek Sarkar",
title = "Race Detection in Two Dimensions",
journal = j-TOPC,
volume = "4",
number = "4",
pages = "19:1--19:??",
month = sep,
year = "2018",
CODEN = "????",
DOI = "https://doi.org/10.1145/3264618",
ISSN = "2329-4949 (print), 2329-4957 (electronic)",
ISSN-L = "2329-4949",
bibdate = "Wed Jan 23 16:12:25 MST 2019",
bibsource = "http://www.math.utah.edu/pub/tex/bib/topc.bib",
abstract = "Dynamic race detection is a program analysis technique
for detecting errors caused by undesired interleavings
of concurrent tasks. A primary challenge when designing
efficient race detection algorithms is to achieve
manageable space requirements. State-of-the-art
algorithms for unstructured parallelism require $
\Theta (n) $ space per monitored memory location, where
n is the total number of tasks. This is a serious
drawback when analyzing programs with many tasks. In
contrast, algorithms for programs with a
series-parallel (SP) structure require only $ \Theta
(1) $ space. Unfortunately, it is currently not well
understood if there are classes of parallelism beyond
SP that can also benefit from and be analyzed with $
\Theta (1) $ space complexity. In this work, we show
that structures richer than SP graphs, namely, that of
two-dimensional (2D) lattices, can also be analyzed in
$ \Theta (1) $ space. Toward that (a) we extend
Tarjan's algorithm for finding lowest common ancestors
to handle 2D lattices; (b) from that extension we
derive a serial algorithm for race detection that can
analyze arbitrary task graphs with a 2D lattice
structure; (c) we present a restriction to fork-join
that admits precisely the 2D lattices as task graphs
(e.g., it can express pipeline parallelism). Our work
generalizes prior work on structured race detection and
aims to provide a deeper understanding of the interplay
between structured parallelism and program analysis
efficiency.",
acknowledgement = ack-nhfb,
articleno = "19",
fjournal = "ACM Transactions on Parallel Computing",
journal-URL = "http://dl.acm.org/citation.cfm?id=2632163",
}
@Article{Lee:2018:ERD,
author = "I-Ting Angelina Lee and Tao B. Schardl",
title = "Efficient Race Detection for Reducer Hyperobjects",
journal = j-TOPC,
volume = "4",
number = "4",
pages = "20:1--20:??",
month = sep,
year = "2018",
CODEN = "????",
DOI = "https://doi.org/10.1145/3205914",
ISSN = "2329-4949 (print), 2329-4957 (electronic)",
ISSN-L = "2329-4949",
bibdate = "Wed Jan 23 16:12:25 MST 2019",
bibsource = "http://www.math.utah.edu/pub/tex/bib/multithreading.bib;
http://www.math.utah.edu/pub/tex/bib/topc.bib",
abstract = "A multithreaded Cilk program that is ostensibly
deterministic may nevertheless behave
nondeterministically due to programming errors in the
code. For a Cilk program that uses reducers-a general
reduction mechanism supported in various Cilk
dialects-such programming errors are especially
challenging to debug, because the errors can expose the
nondeterminism in how the Cilk runtime system manages
reducers. We identify two unique types of races that
arise from incorrect use of reducers in a Cilk program,
and we present two algorithms to catch these races. The
first algorithm, called the Peer-Set algorithm, detects
view-read races, which occur when the program attempts
to retrieve a value out of a reducer when the read may
result in a nondeterministic value, such as before all
previously spawned subcomputations that might update
the reducer have necessarily returned. The second
algorithm, called the SP+ algorithm, detects
determinacy races-instances where a write to a memory
location occurs logically in parallel with another
access to that location-even when the raced-on memory
locations relate to reducers. Both algorithms are
provably correct, asymptotically efficient, and can be
implemented efficiently in practice. We have
implemented both algorithms in our prototype race
detector, Rader. When running Peer-Set, Rader incurs a
geometric-mean multiplicative overhead of 2.56 over
running the benchmark without instrumentation. When
running SP+, Rader incurs a geometric-mean
multiplicative overhead of 16.94.",
acknowledgement = ack-nhfb,
articleno = "20",
fjournal = "ACM Transactions on Parallel Computing",
journal-URL = "http://dl.acm.org/citation.cfm?id=2632163",
}
@Article{Gilbert:2018:ISI,
author = "Seth Gilbert",
title = "Introduction to the Special Issue for {SPAA 2016}",
journal = j-TOPC,
volume = "5",
number = "1",
pages = "1:1--1:??",
month = sep,
year = "2018",
CODEN = "????",
DOI = "https://doi.org/10.1145/3230677",
ISSN = "2329-4949 (print), 2329-4957 (electronic)",
ISSN-L = "2329-4949",
bibdate = "Wed Jan 23 16:12:26 MST 2019",
bibsource = "http://www.math.utah.edu/pub/tex/bib/topc.bib",
acknowledgement = ack-nhfb,
articleno = "1",
fjournal = "ACM Transactions on Parallel Computing",
journal-URL = "http://dl.acm.org/citation.cfm?id=2632163",
}
@Article{Mitzenmacher:2018:BBC,
author = "Michael Mitzenmacher and Rajmohan Rajaraman and Scott
Roche",
title = "Better Bounds for Coalescing-Branching Random Walks",
journal = j-TOPC,
volume = "5",
number = "1",
pages = "2:1--2:??",
month = sep,
year = "2018",
CODEN = "????",
DOI = "https://doi.org/10.1145/3209688",
ISSN = "2329-4949 (print), 2329-4957 (electronic)",
ISSN-L = "2329-4949",
bibdate = "Wed Jan 23 16:12:26 MST 2019",
bibsource = "http://www.math.utah.edu/pub/tex/bib/topc.bib",
abstract = "Coalescing-branching random walks, or cobra walks for
short, are a natural variant of random walks on graphs
that can model the spread of disease through contacts
or the spread of information in networks. In a k -cobra
walk, at each timestep, a subset of the vertices are
active; each active vertex chooses k random neighbors
(sampled independently and uniformly with replacement)
that become active at the next step, and these are the
only active vertices at the next step. A natural
quantity to study for cobra walks is the cover time,
which corresponds to the expected time when all nodes
have become infected or received the disseminated
information. In this article, we extend previous
results for cobra walks in multiple ways. We show that
the cover time for the 2-cobra walk on $ [0, n]^d $ is
$ O(n) $ (where the order notation hides constant
factors that depend on $d$); previous work had shown
the cover time was $ O(n \cdot \polylog (n))$. We show
that the cover time for a 2-cobra walk on an $n$-vertex
$d$ regular graph with conductance $ \phi_G$ is $ O(d^4
\phis^{-2}_G \log^2 n)$, significantly generalizing a
previous result that held only for expander graphs with
sufficiently high expansion. And, finally, we show that
the cover time for a 2-cobra walk on a graph with n
vertices and m edges is always $ O(m n^{3 / 4} \log
n)$; this is the first result showing that the bound of
$ \Theta (n^3)$ for the worst-case cover time for
random walks can be beaten using 2-cobra walks.",
acknowledgement = ack-nhfb,
articleno = "2",
fjournal = "ACM Transactions on Parallel Computing",
journal-URL = "http://dl.acm.org/citation.cfm?id=2632163",
}
@Article{Liu:2018:RAN,
author = "Mingmou Liu and Xiaoyin Pan and Yitong Yin",
title = "Randomized Approximate Nearest Neighbor Search with
Limited Adaptivity",
journal = j-TOPC,
volume = "5",
number = "1",
pages = "3:1--3:??",
month = sep,
year = "2018",
CODEN = "????",
DOI = "https://doi.org/10.1145/3209884",
ISSN = "2329-4949 (print), 2329-4957 (electronic)",
ISSN-L = "2329-4949",
bibdate = "Wed Jan 23 16:12:26 MST 2019",
bibsource = "http://www.math.utah.edu/pub/tex/bib/topc.bib",
abstract = "We study the complexity of parallel data structures
for approximate nearest neighbor search in
$d$-dimensional Hamming space $ \{ 0, 1 \}^d$. A
classic model for static data structures is the
cell-probe model [27]. We consider a cell-probe model
with limited adaptivity, where given a $ k \geq 1$, a
query is resolved by making at most $k$ rounds of
parallel memory accesses to the data structure. We give
two randomized algorithms that solve the approximate
nearest neighbor search using $k$ rounds of parallel
memory accesses: --- a simple algorithm with $ O(k
(\log d)^{1 / k})$ total number of memory accesses for
all $ k \geq 1$ --- an algorithm with $ O(k + (1 / k
\log d)^{O(1 / k)})$ total number of memory accesses
for all sufficiently large $k$. Both algorithms use
data structures of polynomial size. We prove an $
\Omega (1 / k (\log d)^{1 / k})$ lower bound for the
total number of memory accesses for any randomized
algorithm solving the approximate nearest neighbor
search within $ k \leq \log \log d / 2 \log \log \log
d$ rounds of parallel memory accesses on any data
structures of polynomial size. This lower bound shows
that our first algorithm is asymptotically optimal when
$ k = O(1)$. And our second algorithm achieves the
asymptotically optimal tradeoff between number of
rounds and total number of memory accesses. In the
extremal case, when $ k = O(\log \log d / \log \log
\log d)$ is big enough, our second algorithm matches
the $ \Theta (\log \log d / \log \log \log d)$ tight
bound for fully adaptive algorithms for approximate
nearest neighbor search in [11].",
acknowledgement = ack-nhfb,
articleno = "3",
fjournal = "ACM Transactions on Parallel Computing",
journal-URL = "http://dl.acm.org/citation.cfm?id=2632163",
}
@Article{Pandurangan:2018:FDA,
author = "Gopal Pandurangan and Peter Robinson and Michele
Scquizzato",
title = "Fast Distributed Algorithms for Connectivity and {MST}
in Large Graphs",
journal = j-TOPC,
volume = "5",
number = "1",
pages = "4:1--4:??",
month = sep,
year = "2018",
CODEN = "????",
DOI = "https://doi.org/10.1145/3209689",
ISSN = "2329-4949 (print), 2329-4957 (electronic)",
ISSN-L = "2329-4949",
bibdate = "Wed Jan 23 16:12:26 MST 2019",
bibsource = "http://www.math.utah.edu/pub/tex/bib/topc.bib",
abstract = "Motivated by the increasing need to understand the
algorithmic foundations of distributed large-scale
graph computations, we study a number of fundamental
graph problems in a message-passing model for
distributed computing where $ k \geq 2 $ machines
jointly perform computations on graphs with $n$ nodes
(typically, $ n \gg k$). The input graph is assumed to
be initially randomly partitioned among the $k$
machines, a common implementation in many real-world
systems. Communication is point-to-point, and the goal
is to minimize the number of communication rounds of
the computation. Our main result is an (almost) optimal
distributed randomized algorithm for graph
connectivity. Our algorithm runs in $ {\~ O}(n / k^2)$
rounds ($ {\~ O}$ notation hides a $ \polylog (n)$
factor and an additive $ \polylog (n)$ term). This
improves over the best previously known bound of $ {\~
O}(n / k)$ [Klauck et al., SODA 2015] and is optimal
(up to a polylogarithmic factor) in light of an
existing lower bound of $ \Omega^~(n / k^2)$. Our
improved algorithm uses a bunch of techniques,
including linear graph sketching, that prove useful in
the design of efficient distributed graph algorithms.
Using the connectivity algorithm as a building block,
we then present fast randomized algorithms for
computing minimum spanning trees, (approximate)
min-cuts, and for many graph verification problems. All
these algorithms take $ {\~ O}(n / k^2)$ rounds and are
optimal up to polylogarithmic factors. We also show an
almost matching lower bound of $ \Omega^~(n / k^2)$
rounds for many graph verification problems by
leveraging lower bounds in random-partition
communication complexity.",
acknowledgement = ack-nhfb,
articleno = "4",
fjournal = "ACM Transactions on Parallel Computing",
journal-URL = "http://dl.acm.org/citation.cfm?id=2632163",
}
@Article{Korupolu:2018:RPF,
author = "Madhukar Korupolu and Rajmohan Rajaraman",
title = "Robust and Probabilistic Failure-Aware Placement",
journal = j-TOPC,
volume = "5",
number = "1",
pages = "5:1--5:??",
month = sep,
year = "2018",
CODEN = "????",
DOI = "https://doi.org/10.1145/3210367",
ISSN = "2329-4949 (print), 2329-4957 (electronic)",
ISSN-L = "2329-4949",
bibdate = "Wed Jan 23 16:12:26 MST 2019",
bibsource = "http://www.math.utah.edu/pub/tex/bib/topc.bib",
abstract = "Motivated by the growing complexity and heterogeneity
of modern data centers, and the prevalence of commodity
component failures, this article studies the
failure-aware placement problem of placing tasks of a
parallel job on machines in the data center with the
goal of increasing availability. We consider two models
of failures: adversarial and probabilistic. In the
adversarial model, each node has a weight (higher
weight implying higher reliability) and the adversary
can remove any subset of nodes of total weight at most
a given bound W and our goal is to find a placement
that incurs the least disruption against such an
adversary. In the probabilistic model, each node has a
probability of failure and we need to find a placement
that maximizes the probability that at least K out of N
tasks survive at any time. For adversarial failures, we
first show that (i) the problems are in $ \Sigma_2 $,
the second level of the polynomial hierarchy; (ii) a
variant of the problem that we call RobustFap (for
Robust Failure-Aware Placement) is co-NP-hard; and
(iii) an all-or-nothing version of RobustFap is $
\Sigma_2$-complete. We then give a polynomial-time
approximation scheme (PTAS) for RobustFap, a key
ingredient of which is a solution that we design for a
fractional version of RobustFap. We then study
HierRobustFap, which is the fractional RobustFap
problem over a hierarchical network, in which failures
can occur at any subset of nodes in the hierarchy, and
a failure at a node can adversely impact all of its
descendants in the hierarchy. To solve HierRobustFap,
we introduce a notion of hierarchical max-min fairness
and a novel Generalized Spreading algorithm, which is
simultaneously optimal for every upper bound W on the
total weight of nodes that an adversary can fail. These
generalize the classical notion of max-min fairness to
work with nodes of differing capacities, differing
reliability weights, and hierarchical structures. Using
randomized rounding, we extend this to give an
algorithm for integral HierRobustFap. For the
probabilistic version, we first give an algorithm that
achieves an additive $ \epsilon $ approximation in the
failure probability for the single level version,
called ProbFap, while giving up a $ (1 + \epsilon)$
multiplicative factor in the number of failures. We
then extend the result to the hierarchical version,
HierProbFap, achieving an \epsilon additive
approximation in failure probability while giving up an
$ (L + \epsilon)$ multiplicative factor in the number
of failures, where $L$ is the number of levels in the
hierarchy.",
acknowledgement = ack-nhfb,
articleno = "5",
fjournal = "ACM Transactions on Parallel Computing",
journal-URL = "http://dl.acm.org/citation.cfm?id=2632163",
}
@Article{Zhang:2018:LFT,
author = "Deli Zhang and Pierre Laborde and Lance Lebanoff and
Damian Dechev",
title = "Lock-Free Transactional Transformation for Linked Data
Structures",
journal = j-TOPC,
volume = "5",
number = "1",
pages = "6:1--6:??",
month = sep,
year = "2018",
CODEN = "????",
DOI = "https://doi.org/10.1145/3209690",
ISSN = "2329-4949 (print), 2329-4957 (electronic)",
ISSN-L = "2329-4949",
bibdate = "Wed Jan 23 16:12:26 MST 2019",
bibsource = "http://www.math.utah.edu/pub/tex/bib/hash.bib;
http://www.math.utah.edu/pub/tex/bib/topc.bib",
abstract = "Nonblocking data structures allow scalable and
thread-safe access to shared data. They provide
individual operations that appear to execute
atomically. However, it is often desirable to execute
multiple operations atomically in a transactional
manner. Previous solutions, such as Software
Transactional Memory (STM) and transactional boosting,
manage transaction synchronization separately from the
underlying data structure's thread synchronization.
Although this reduces programming effort, it leads to
overhead associated with additional synchronization and
the need to rollback aborted transactions. In this
work, we present a new methodology for transforming
high-performance lock-free linked data structures into
high-performance lock-free transactional linked data
structures without revamping the data structures'
original synchronization design. Our approach leverages
the semantic knowledge of the data structure to
eliminate the overhead of false conflicts and
rollbacks. We encapsulate all operations, operands, and
transaction status in a transaction descriptor, which
is shared among the nodes accessed by the same
transaction. We coordinate threads to help finish the
remaining operations of delayed transactions based on
their transaction descriptors. When a transaction
fails, we recover the correct abstract state by
reversely interpreting the logical status of a node. We
also present an obstruction-free version of our
algorithm that can be applied to dynamic execution
scenarios and an example of our approach applied to a
hash map. In our experimental evaluation using
transactions with randomly generated operations, our
lock-free transactional data structures outperform the
transactional boosted ones by 70\% on average. They
also outperform the alternative STM-based approaches by
a factor of 2 to 13 across all scenarios. More
importantly, we achieve 4,700 to 915,000 times fewer
spurious aborts than the alternatives.",
acknowledgement = ack-nhfb,
articleno = "6",
fjournal = "ACM Transactions on Parallel Computing",
journal-URL = "http://dl.acm.org/citation.cfm?id=2632163",
}
@Article{Muller:2018:NHP,
author = "Michel M{\"u}ller and Takayuki Aoki",
title = "New High Performance {GPGPU} Code Transformation
Framework Applied to Large Production Weather
Prediction Code",
journal = j-TOPC,
volume = "5",
number = "2",
pages = "7:1--7:??",
month = jan,
year = "2018",
CODEN = "????",
DOI = "https://doi.org/10.1145/3291523",
ISSN = "2329-4949 (print), 2329-4957 (electronic)",
ISSN-L = "2329-4949",
bibdate = "Wed Jan 23 16:12:26 MST 2019",
bibsource = "http://www.math.utah.edu/pub/tex/bib/fortran3.bib;
http://www.math.utah.edu/pub/tex/bib/topc.bib",
abstract = "We introduce ``Hybrid Fortran,'' a new approach that
allows a high-performance GPGPU port for structured
grid Fortran codes. This technique only requires
minimal changes for a CPU targeted codebase, which is a
significant advancement in terms of productivity. It
has been successfully applied to both dynamical core
and physical processes of ASUCA, a Japanese mesoscale
weather prediction model with more than 150k lines of
code. By means of a minimal weather application that
resembles ASUCA's code structure, Hybrid Fortran is
compared to both a performance model as well as today's
commonly used method, OpenACC. As a result, the Hybrid
Fortran implementation is shown to deliver the same or
better performance than OpenACC, and its performance
agrees with the model both on CPU and GPU. In a
full-scale production run, using an ASUCA grid with
1581 $ \times $ 1301 $ \times $ 58 cells and real-world
weather data in 2km resolution, 24 NVIDIA Tesla P100
running the Hybrid Fortran-based GPU port are shown to
replace more than fifty 18-core Intel Xeon Broadwell
E5-2695 v4 running the reference implementation-an
achievement comparable to more invasive GPGPU rewrites
of other weather models.",
acknowledgement = ack-nhfb,
articleno = "7",
fjournal = "ACM Transactions on Parallel Computing",
journal-URL = "http://dl.acm.org/citation.cfm?id=2632163",
}
@Article{Burtscher:2018:HQF,
author = "Martin Burtscher and Sindhu Devale and Sahar Azimi and
Jayadharini Jaiganesh and Evan Powers",
title = "A High-Quality and Fast Maximal Independent Set
Implementation for {GPUs}",
journal = j-TOPC,
volume = "5",
number = "2",
pages = "8:1--8:??",
month = jan,
year = "2018",
CODEN = "????",
DOI = "https://doi.org/10.1145/3291525",
ISSN = "2329-4949 (print), 2329-4957 (electronic)",
ISSN-L = "2329-4949",
bibdate = "Wed Jan 23 16:12:26 MST 2019",
bibsource = "http://www.math.utah.edu/pub/tex/bib/topc.bib",
abstract = "Computing a maximal independent set is an important
step in many parallel graph algorithms. This article
introduces ECL-MIS, a maximal independent set
implementation that works well on GPUs. It includes key
optimizations to speed up computation, reduce the
memory footprint, and increase the set size. Its CUDA
implementation requires fewer than 30 kernel
statements, runs asynchronously, and produces a
deterministic result. It outperforms the maximal
independent set implementations of Pannotia, CUSP, and
IrGL on each of the 16 tested graphs of various types
and sizes. On a Titan X GPU, ECL-MIS is between 3.9 and
100 times faster (11.5 times, on average). ECL-MIS
running on the GPU is also faster than the parallel CPU
codes Ligra, Ligra+, and PBBS running on 20 Xeon cores,
which it outperforms by 4.1 times, on average. At the
same time, ECL-MIS produces maximal independent sets
that are up to 52\% larger (over 10\%, on average)
compared to these preexisting CPU and GPU
implementations. Whereas these codes produce maximal
independent sets that are, on average, about 15\%
smaller than the largest possible such sets, ECL-MIS
sets are less than 6\% smaller than the maximum
independent sets.",
acknowledgement = ack-nhfb,
articleno = "8",
fjournal = "ACM Transactions on Parallel Computing",
journal-URL = "http://dl.acm.org/citation.cfm?id=2632163",
}
@Article{Liu:2018:APR,
author = "Junhong Liu and Guangming Tan and Yulong Luo and
Jiajia Li and Zeyao Mo and Ninghui Sun",
title = "An Autotuning Protocol to Rapidly Build Autotuners",
journal = j-TOPC,
volume = "5",
number = "2",
pages = "9:1--9:??",
month = jan,
year = "2018",
CODEN = "????",
DOI = "https://doi.org/10.1145/3291527",
ISSN = "2329-4949 (print), 2329-4957 (electronic)",
ISSN-L = "2329-4949",
bibdate = "Wed Jan 23 16:12:26 MST 2019",
bibsource = "http://www.math.utah.edu/pub/tex/bib/topc.bib",
abstract = "Automatic performance tuning (Autotuning) is an
increasingly critical tuning technique for the high
portable performance of Exascale applications. However,
constructing an autotuner from scratch remains a
challenge, even for domain experts. In this work, we
propose a performance tuning and knowledge management
suite (PAK) to help rapidly build autotuners. In order
to accommodate existing autotuning techniques, we
present an autotuning protocol that is composed of an
extractor, producer, optimizer, evaluator, and learner.
To achieve modularity and reusability, we also define
programming interfaces for each protocol component as
the fundamental infrastructure, which provides a
customizable mechanism to deploy knowledge mining in
the performance database. PAK's usability is
demonstrated by studying two important computational
kernels: stencil computation and sparse matrix-vector
multiplication (SpMV). Our proposed autotuner based on
PAK shows comparable performance and higher
productivity than traditional autotuners by writing
just a few tens of code using our autotuning
protocol.",
acknowledgement = ack-nhfb,
articleno = "9",
fjournal = "ACM Transactions on Parallel Computing",
journal-URL = "http://dl.acm.org/citation.cfm?id=2632163",
}
@Article{Anta:2018:SDP,
author = "Antonio Fern{\'a}ndez Anta and Dariusz R. Kowalski and
Miguel A. Mosteiro and Prudence W. H. Wong",
title = "Scheduling Dynamic Parallel Workload of Mobile Devices
with Access Guarantees",
journal = j-TOPC,
volume = "5",
number = "2",
pages = "10:1--10:??",
month = jan,
year = "2018",
CODEN = "????",
DOI = "https://doi.org/10.1145/3291529",
ISSN = "2329-4949 (print), 2329-4957 (electronic)",
ISSN-L = "2329-4949",
bibdate = "Wed Jan 23 16:12:26 MST 2019",
bibsource = "http://www.math.utah.edu/pub/tex/bib/topc.bib",
abstract = "We study a dynamic resource-allocation problem that
arises in various parallel computing scenarios, such as
mobile cloud computing, cloud computing systems,
Internet of Things systems, and others. Generically, we
model the architecture as client mobile devices and
static base stations. Each client ``arrives'' to the
system to upload data to base stations by radio
transmissions and then ``leaves.'' The problem, called
Station Assignment, is to assign clients to stations so
that every client uploads their data under some
restrictions, including a target subset of stations, a
maximum delay between transmissions, a volume of data
to upload, and a maximum bandwidth for each station. We
study the solvability of Station Assignment under an
adversary that controls the arrival and departure of
clients, limited to maximum rate and burstiness of such
arrivals. We show upper and lower bounds on the rate
and burstiness for various client arrival schedules and
protocol classes. To the best of our knowledge, this is
the first time that Station Assignment is studied under
adversarial arrivals and departures.",
acknowledgement = ack-nhfb,
articleno = "10",
fjournal = "ACM Transactions on Parallel Computing",
journal-URL = "http://dl.acm.org/citation.cfm?id=2632163",
}
@Article{Mirhosseini:2018:BBA,
author = "Amirhossein Mirhosseini and Mohammad Sadrosadati and
Fatemeh Aghamohammadi and Mehdi Modarressi and Hamid
Sarbazi-Azad",
title = "{BARAN}: Bimodal Adaptive Reconfigurable-Allocator
Network-on-Chip",
journal = j-TOPC,
volume = "5",
number = "3",
pages = "11:1--11:??",
month = jan,
year = "2018",
CODEN = "????",
DOI = "https://doi.org/10.1145/3294049",
ISSN = "2329-4949 (print), 2329-4957 (electronic)",
ISSN-L = "2329-4949",
bibdate = "Wed Jan 23 16:12:26 MST 2019",
bibsource = "http://www.math.utah.edu/pub/tex/bib/topc.bib",
abstract = "Virtual channels are employed to improve the
throughput under high traffic loads in
Networks-on-Chips (NoCs). However, they can impose
non-negligible overheads on performance by prolonging
clock cycle time, especially under low traffic loads
where the impact of virtual channels on performance is
trivial. In this article, we propose a novel
architecture, called BARAN, that can either improve
on-chip network performance or reduce its power
consumption (depending on the specific implementation
chosen), not both at the same time, when virtual
channels are underutilized; that is, the average number
of virtual channel allocation requests per cycle is
lower than the number of total virtual channels. We
also introduce a reconfigurable arbitration logic
within the BARAN architecture that can be configured to
have multiple latencies and, hence, multiple slack
times. The increased slack times are then used to
reduce the supply voltage of the routers or increase
their clock frequency in order to reduce power
consumption or improve the performance of the whole NoC
system. The power-centric design of BARAN reduces NoC
power consumption by 43.4\% and 40.6\% under CMP and
GPU workloads, on average, respectively, compared to a
baseline architecture while imposing negligible area
and performance overheads. The performance-centric
design of BARAN reduces the average packet latency by
45.4\% and 42.1\%, on average, under CMP and GPU
workloads, respectively, compared to the baseline
architecture while increasing power consumption by
39.7\% and 43.7\%, on average. Moreover, the
performance-centric BARAN postpones the network
saturation rate by 11.5\% under uniform random traffic
compared to the baseline architecture.",
acknowledgement = ack-nhfb,
articleno = "11",
fjournal = "ACM Transactions on Parallel Computing",
journal-URL = "http://dl.acm.org/citation.cfm?id=2632163",
}
@Article{Amer:2018:LCM,
author = "Abdelhalim Amer and Huiwei Lu and Pavan Balaji and
Milind Chabbi and Yanjie Wei and Jeff Hammond and
Satoshi Matsuoka",
title = "Lock Contention Management in Multithreaded {MPI}",
journal = j-TOPC,
volume = "5",
number = "3",
pages = "12:1--12:??",
month = jan,
year = "2018",
CODEN = "????",
DOI = "https://doi.org/10.1145/3275443",
ISSN = "2329-4949 (print), 2329-4957 (electronic)",
ISSN-L = "2329-4949",
bibdate = "Wed Jan 23 16:12:26 MST 2019",
bibsource = "http://www.math.utah.edu/pub/tex/bib/multithreading.bib;
http://www.math.utah.edu/pub/tex/bib/pvm.bib;
http://www.math.utah.edu/pub/tex/bib/topc.bib",
URL = "https://dl.acm.org/ft_gateway.cfm?id=3275443",
abstract = "In this article, we investigate contention management
in lock-based thread-safe MPI libraries. Specifically,
we make two assumptions: (1) locks are the only form of
synchronization when protecting communication paths;
and (2) contention occurs, and thus serialization is
unavoidable. Our work distinguishes between lock
acquisitions with respect to work being performed
inside a critical section; productive vs. unproductive.
Waiting for message reception without doing anything
else inside a critical section is an example of
unproductive lock acquisition. We show that the
high-throughput nature of modern scalable locking
protocols translates into better communication progress
for throughput-intensive MPI communication but
negatively impacts latency-sensitive communication
because of overzealous unproductive lock acquisition.
To reduce unproductive lock acquisitions, we devised a
method that promotes threads with productive work using
a generic two-level priority locking protocol. Our
results show that using a high-throughput protocol for
productive work and a fair protocol for less productive
code paths ensures the best tradeoff for fine-grained
communication, whereas a fair protocol is sufficient
for more coarse-grained communication. Although these
efforts have been rewarding, scalability degradation
remains significant. We discuss techniques that diverge
from the pure locking model and offer the potential to
further improve scalability.",
acknowledgement = ack-nhfb,
articleno = "12",
fjournal = "ACM Transactions on Parallel Computing",
journal-URL = "http://dl.acm.org/citation.cfm?id=2632163",
}
@Article{Chen:2018:PDG,
author = "Rong Chen and Jiaxin Shi and Yanzhe Chen and Binyu
Zang and Haibing Guan and Haibo Chen",
title = "{PowerLyra}: Differentiated Graph Computation and
Partitioning on Skewed Graphs",
journal = j-TOPC,
volume = "5",
number = "3",
pages = "13:1--13:??",
month = jan,
year = "2018",
CODEN = "????",
DOI = "https://doi.org/10.1145/3298989",
ISSN = "2329-4949 (print), 2329-4957 (electronic)",
ISSN-L = "2329-4949",
bibdate = "Wed Jan 23 16:12:26 MST 2019",
bibsource = "http://www.math.utah.edu/pub/tex/bib/topc.bib",
abstract = "Natural graphs with skewed distributions raise unique
challenges to distributed graph computation and
partitioning. Existing graph-parallel systems usually
use a ``one-size-fits-all'' design that uniformly
processes all vertices, which either suffer from
notable load imbalance and high contention for
high-degree vertices (e.g., Pregel and GraphLab) or
incur high communication cost and memory consumption
even for low-degree vertices (e.g., PowerGraph and
GraphX). In this article, we argue that skewed
distributions in natural graphs also necessitate
differentiated processing on high-degree and low-degree
vertices. We then introduce PowerLyra, a new
distributed graph processing system that embraces the
best of both worlds of existing graph-parallel systems.
Specifically, PowerLyra uses centralized computation
for low-degree vertices to avoid frequent
communications and distributes the computation for
high-degree vertices to balance workloads. PowerLyra
further provides an efficient hybrid graph partitioning
algorithm (i.e., hybrid-cut) that combines edge-cut
(for low-degree vertices) and vertex-cut (for
high-degree vertices) with heuristics. To improve cache
locality of inter-node graph accesses, PowerLyra
further provides a locality-conscious data layout
optimization. PowerLyra is implemented based on the
latest GraphLab and can seamlessly support various
graph algorithms running in both synchronous and
asynchronous execution modes. A detailed evaluation on
three clusters using various graph-analytics and MLDM
(Machine Learning and Data Mining) applications shows
that PowerLyra outperforms PowerGraph by up to 5.53X
(from 1.24X) and 3.26X (from 1.49X) for real-world and
synthetic graphs, respectively, and is much faster than
other systems like GraphX and Giraph, yet with much
less memory consumption. A porting of hybrid-cut to
GraphX further confirms the efficiency and generality
of PowerLyra.",
acknowledgement = ack-nhfb,
articleno = "13",
fjournal = "ACM Transactions on Parallel Computing",
journal-URL = "http://dl.acm.org/citation.cfm?id=2632163",
}
@Article{Aravind:2019:GME,
author = "Alex Aravind and Wim H. Hesselink",
title = "Group Mutual Exclusion by Fetch-and-increment",
journal = j-TOPC,
volume = "5",
number = "4",
pages = "14:1--14:??",
month = mar,
year = "2019",
CODEN = "????",
DOI = "https://doi.org/10.1145/3309202",
ISSN = "2329-4949 (print), 2329-4957 (electronic)",
ISSN-L = "2329-4949",
bibdate = "Mon Mar 11 18:54:51 MDT 2019",
bibsource = "http://www.math.utah.edu/pub/tex/bib/topc.bib",
URL = "https://dl.acm.org/ft_gateway.cfm?id=3309202\&ftid=2042588\&dwn=1\&CFID=115464021\&CFTOKEN=e83128dce764b9e9-5990FFA0-D877-BDE1-A02F1158E3AD2EED",
abstract = "The group mutual exclusion (GME) problem (also called
the room synchronization problem) arises in various
practical applications that require concurrent data
sharing. Group mutual exclusion aims to achieve
exclusive access to a shared resource (a shared room)
while facilitating concurrency among non-conflicting
requests. The problem is that threads with distinct
interests are not allowed to access the shared resource
concurrently, but multiple threads with same interest
can. In Blelloch et al. (2003), the authors presented a
simple solution to the room synchronization problem
using fetch8add (F8A) and test-and-set (T8S) atomic
operations. This algorithm has $O(m)$ remote memory
references (RMRs) in the cache coherent (CC) model,
where $m$ is the number of forums. In Bhatt and Huang
(2010), an open problem was posed: `` Is it possible to
design a GME algorithm with constant RMR for the CC
model using fetch8add instructions? '' This question is
partially answered in this article by presenting a
group mutual exclusion algorithm using
fetch-and-increment instructions. The algorithm is
simple and scalable.",
acknowledgement = ack-nhfb,
articleno = "14",
fjournal = "ACM Transactions on Parallel Computing",
journal-URL = "http://dl.acm.org/citation.cfm?id=2632163",
}
@Article{Behzad:2019:OPH,
author = "Babak Behzad and Surendra Byna and Prabhat and Marc
Snir",
title = "Optimizing {I/O} Performance of {HPC} Applications
with Autotuning",
journal = j-TOPC,
volume = "5",
number = "4",
pages = "15:1--15:??",
month = mar,
year = "2019",
CODEN = "????",
DOI = "https://doi.org/10.1145/3309205",
ISSN = "2329-4949 (print), 2329-4957 (electronic)",
ISSN-L = "2329-4949",
bibdate = "Mon Mar 11 18:54:51 MDT 2019",
bibsource = "http://www.math.utah.edu/pub/tex/bib/topc.bib",
URL = "https://dl.acm.org/ft_gateway.cfm?id=3309205\&ftid=2044223\&dwn=1\&CFID=115464021\&CFTOKEN=e83128dce764b9e9-5990FFA0-D877-BDE1-A02F1158E3AD2EED",
abstract = "Parallel Input output is an essential component of
modern high-performance computing (HPC). Obtaining good
I/O performance for a broad range of applications on
diverse HPC platforms is a major challenge, in part,
because of complex inter dependencies between I/O
middleware and hardware. The parallel file system and
I/O middleware layers all offer optimization parameters
that can, in theory, result in better I/O performance.
Unfortunately, the right combination of parameters is
highly dependent on the application, HPC platform,
problem size, and concurrency. Scientific application
developers do not have the time or expertise to take on
the substantial burden of identifying good parameters
for each problem configuration. They resort to using
system defaults, a choice that frequently results in
poor I/O performance. We expect this problem to be
compounded on exascale-class machines, which will
likely have a deeper software stack with hierarchically
arranged hardware resources. We present as a solution
to this problem an autotuning system for optimizing I/O
performance, I/O performance modeling, I/O tuning, and
I/O patterns. We demonstrate the value of this
framework across several HPC platforms and applications
at scale.",
acknowledgement = ack-nhfb,
articleno = "15",
fjournal = "ACM Transactions on Parallel Computing",
journal-URL = "http://dl.acm.org/citation.cfm?id=2632163",
}
@Article{Maier:2019:CHT,
author = "Tobias Maier and Peter Sanders and Roman Dementiev",
title = "Concurrent Hash Tables: Fast and General(?)!",
journal = j-TOPC,
volume = "5",
number = "4",
pages = "16:1--16:??",
month = mar,
year = "2019",
CODEN = "????",
DOI = "https://doi.org/10.1145/3309206",
ISSN = "2329-4949 (print), 2329-4957 (electronic)",
ISSN-L = "2329-4949",
bibdate = "Mon Mar 11 18:54:51 MDT 2019",
bibsource = "http://www.math.utah.edu/pub/tex/bib/hash.bib;
http://www.math.utah.edu/pub/tex/bib/topc.bib",
URL = "https://dl.acm.org/ft_gateway.cfm?id=3309206\&ftid=2042589\&dwn=1\&CFID=115464021\&CFTOKEN=e83128dce764b9e9-5990FFA0-D877-BDE1-A02F1158E3AD2EED",
abstract = "Concurrent hash tables are one of the most important
concurrent data structures, which are used in numerous
applications. For some applications, it is common that
hash table accesses dominate the execution time. To
efficiently solve these problems in parallel, we need
implementations that achieve speedups in highly
concurrent scenarios. Unfortunately, currently
available concurrent hashing libraries are far away
from this requirement, in particular, when adaptively
sized tables are necessary or contention on some
elements occurs. Our starting point for better
performing data structures is a fast and simple
lock-free concurrent hash table based on linear probing
that is, however, limited to word-sized key-value types
and does not support dynamic size adaptation. We
explain how to lift these limitations in a provably
scalable way and demonstrate that dynamic growing has a
performance overhead comparable to the same
generalization in sequential hash tables. We perform
extensive experiments comparing the performance of our
implementations with six of the most widely used
concurrent hash tables. Ours are considerably faster
than the best algorithms with similar restrictions and
an order of magnitude faster than the best more general
tables. In some extreme cases, the difference even
approaches four orders of magnitude. All our
implementations discussed in this publication can be
found on github [17].",
acknowledgement = ack-nhfb,
articleno = "16",
fjournal = "ACM Transactions on Parallel Computing",
journal-URL = "http://dl.acm.org/citation.cfm?id=2632163",
}
@Article{Cruz:2019:ETM,
author = "Eduardo H. M. Cruz and Matthias Diener and La{\'e}rcio
L. Pilla and Philippe O. A. Navaux",
title = "{EagerMap}: a Task Mapping Algorithm to Improve
Communication and Load Balancing in Clusters of
Multicore Systems",
journal = j-TOPC,
volume = "5",
number = "4",
pages = "17:1--17:??",
month = mar,
year = "2019",
CODEN = "????",
DOI = "https://doi.org/10.1145/3309711",
ISSN = "2329-4949 (print), 2329-4957 (electronic)",
ISSN-L = "2329-4949",
bibdate = "Mon Mar 11 18:54:51 MDT 2019",
bibsource = "http://www.math.utah.edu/pub/tex/bib/topc.bib",
URL = "https://dl.acm.org/ft_gateway.cfm?id=3309711\&ftid=2044225\&dwn=1\&CFID=115464021\&CFTOKEN=e83128dce764b9e9-5990FFA0-D877-BDE1-A02F1158E3AD2EED",
abstract = "Communication between tasks and load imbalance have
been identified as a major challenge for the
performance and energy efficiency of parallel
applications. A common way to improve communication is
to increase its locality, that is, to reduce the
distances of data transfers, prioritizing the usage of
faster and more efficient local interconnections over
remote ones. Regarding load imbalance, cores should
execute a similar amount of work. An important problem
to be solved in this context is how to determine an
optimized mapping of tasks to cluster nodes and cores
that increases the overall locality and load balancing.
In this article, we propose the EagerMap algorithm to
determine task mappings, which is based on a greedy
heuristic to match application communication patterns
to hardware hierarchies and which can also consider the
task load. Compared to previous algorithms, EagerMap is
faster, scales better, and supports more types of
computer systems, while maintaining the same or better
quality of the determined task mapping. EagerMap is
therefore an interesting choice for task mapping on a
variety of modern parallel architectures.",
acknowledgement = ack-nhfb,
articleno = "17",
fjournal = "ACM Transactions on Parallel Computing",
journal-URL = "http://dl.acm.org/citation.cfm?id=2632163",
}