%%% -*-BibTeX-*- %%% ==================================================================== %%% BibTeX-file{ %%% author = "Nelson H. F. Beebe", %%% version = "1.12", %%% date = "11 March 2019", %%% time = "18:58:03 MST", %%% filename = "topc.bib", %%% address = "University of Utah %%% Department of Mathematics, 110 LCB %%% 155 S 1400 E RM 233 %%% Salt Lake City, UT 84112-0090 %%% USA", %%% telephone = "+1 801 581 5254", %%% FAX = "+1 801 581 4148", %%% URL = "http://www.math.utah.edu/~beebe", %%% checksum = "00952 5174 29538 275848", %%% email = "beebe at math.utah.edu, beebe at acm.org, %%% beebe at computer.org (Internet)", %%% codetable = "ISO/ASCII", %%% keywords = "ACM Transactions on Parallel Computing %%% (TOPC); bibliography; BibTeX", %%% license = "public domain", %%% supported = "yes", %%% docstring = "This is a COMPLETE BibTeX bibliography for %%% ACM Transactions on Parallel Computing (TOPC) %%% (CODEN ????, ISSN 2329-4949 (print), %%% 2329-4957 (electronic)). The journal appears %%% quarterly, and publication began with volume %%% 1, number 1, in September 2014. %%% %%% At version 1.12, the COMPLETE journal %%% coverage looked like this: %%% %%% 2014 ( 8) 2016 ( 25) 2018 ( 22) %%% 2015 ( 29) 2017 ( 16) 2019 ( 4) %%% %%% Article: 104 %%% %%% Total entries: 104 %%% %%% The journal Web page can be found at: %%% %%% http://topc.acm.org/ %%% %%% The journal table of contents page is at: %%% %%% http://dl.acm.org/pub.cfm?id=J1496 %%% http://dl.acm.org/citation.cfm?id=2632163 %%% %%% Qualified subscribers can retrieve the full %%% text of recent articles in PDF form. %%% %%% The initial draft was extracted from the ACM %%% Web pages. %%% %%% ACM copyrights explicitly permit abstracting %%% with credit, so article abstracts, keywords, %%% and subject classifications have been %%% included in this bibliography wherever %%% available. Article reviews have been %%% omitted, until their copyright status has %%% been clarified. %%% %%% bibsource keys in the bibliography entries %%% below indicate the entry originally came %%% from the computer science bibliography %%% archive, even though it has likely since %%% been corrected and updated. %%% %%% URL keys in the bibliography point to %%% World Wide Web locations of additional %%% information about the entry. %%% %%% BibTeX citation tags are uniformly chosen %%% as name:year:abbrev, where name is the %%% family name of the first author or editor, %%% year is a 4-digit number, and abbrev is a %%% 3-letter condensation of important title %%% words. Citation tags were automatically %%% generated by software developed for the %%% BibNet Project. %%% %%% In this bibliography, entries are sorted in %%% publication order, using ``bibsort -byvolume.'' %%% %%% The checksum field above contains a CRC-16 %%% checksum as the first value, followed by the %%% equivalent of the standard UNIX wc (word %%% count) utility output of lines, words, and %%% characters. This is produced by Robert %%% Solovay's checksum utility.", %%% } %%% ====================================================================

@Preamble{"\input bibnames.sty" # "\def \TM {${}^{\sc TM}$}" }

%%% ==================================================================== %%% Acknowledgement abbreviations:

@String{ack-nhfb= "Nelson H. F. Beebe, University of Utah, Department of Mathematics, 110 LCB, 155 S 1400 E RM 233, Salt Lake City, UT 84112-0090, USA, Tel: +1 801 581 5254, FAX: +1 801 581 4148, e-mail: \path|beebe@math.utah.edu|, \path|beebe@acm.org|, \path|beebe@computer.org| (Internet), URL: \path|http://www.math.utah.edu/~beebe/|"}

%%% ==================================================================== %%% Journal abbreviations:

@String{j-TOPC= "ACM Transactions on Parallel Computing (TOPC)"}

%%% ==================================================================== %%% Bibliography entries:

@Article{Gibbons:2014:ATP, author = "Phillip B. Gibbons", title = "{ACM Transactions on Parallel Computing}: an introduction", journal = j-TOPC, volume = "1", number = "1", pages = "1:1--1:??", month = sep, year = "2014", CODEN = "????", DOI = "https://doi.org/10.1145/2661651", ISSN = "2329-4949 (print), 2329-4957 (electronic)", ISSN-L = "2329-4949", bibdate = "Fri Oct 17 12:28:03 MDT 2014", bibsource = "http://topc.acm.org/; http://www.math.utah.edu/pub/tex/bib/topc.bib", abstract = "Techniques", acknowledgement = ack-nhfb, articleno = "1", fjournal = "ACM Transactions on Parallel Computing", journal-URL = "http://dl.acm.org/citation.cfm?id=2632163", } @Article{Lilja:2014:I, author = "David J. Lilja", title = "Introduction", journal = j-TOPC, volume = "1", number = "1", pages = "2:1--2:??", month = sep, year = "2014", CODEN = "????", DOI = "https://doi.org/10.1145/2609798", ISSN = "2329-4949 (print), 2329-4957 (electronic)", ISSN-L = "2329-4949", bibdate = "Fri Oct 17 12:28:03 MDT 2014", bibsource = "http://topc.acm.org/; http://www.math.utah.edu/pub/tex/bib/topc.bib", acknowledgement = ack-nhfb, articleno = "2", fjournal = "ACM Transactions on Parallel Computing", journal-URL = "http://dl.acm.org/citation.cfm?id=2632163", } @Article{Rane:2014:EPO, author = "Ashay Rane and James Browne", title = "Enhancing Performance Optimization of Multicore\slash Multichip Nodes with Data Structure Metrics", journal = j-TOPC, volume = "1", number = "1", pages = "3:1--3:??", month = sep, year = "2014", CODEN = "????", DOI = "https://doi.org/10.1145/2588788", ISSN = "2329-4949 (print), 2329-4957 (electronic)", ISSN-L = "2329-4949", bibdate = "Fri Oct 17 12:28:03 MDT 2014", bibsource = "http://topc.acm.org/; http://www.math.utah.edu/pub/tex/bib/topc.bib", abstract = "Program performance optimization is usually based solely on measurements of execution behavior of code segments using hardware performance counters. However, memory access patterns are critical performance limiting factors for today's multicore chips where performance is highly memory bound. Therefore diagnoses and selection of optimizations based only on measurements of the execution behavior of code segments are incomplete because they do not incorporate knowledge of memory access patterns and behaviors. This article presents a low-overhead tool (MACPO) that captures memory traces and computes metrics for the memory access behavior of source-level (C, C++, Fortran) data structures. MACPO explicitly targets the measurement and metrics important to performance optimization for multicore chips. The article also presents a complete process for integrating measurement and analyses of code execution with measurements and analyses of memory access patterns and behaviors for performance optimization, specifically targeting multicore chips and multichip nodes of clusters. MACPO uses more realistic cache models for computation of latency metrics than those used by previous tools. Evaluation of the effectiveness of adding memory access behavior characteristics of data structures to performance optimization was done on subsets of the ASCI, NAS and Rodinia parallel benchmarks and two versions of one application program from a domain not represented in these benchmarks. Adding characteristics of the behavior of data structures enabled easier diagnoses of bottlenecks and more accurate selection of appropriate optimizations than with only code centric behavior measurements. The performance gains ranged from a few percent to 38 percent.", acknowledgement = ack-nhfb, articleno = "3", fjournal = "ACM Transactions on Parallel Computing", journal-URL = "http://dl.acm.org/citation.cfm?id=2632163", } @Article{Jimenez:2014:APP, author = "V{\'\i}ctor Jim{\'e}nez and Francisco J. Cazorla and Roberto Gioiosa and Alper Buyuktosunoglu and Pradip Bose and Francis P. O'Connell and Bruce G. Mealey", title = "Adaptive Prefetching on {POWER7}: Improving Performance and Power Consumption", journal = j-TOPC, volume = "1", number = "1", pages = "4:1--4:??", month = sep, year = "2014", CODEN = "????", DOI = "https://doi.org/10.1145/2588889", ISSN = "2329-4949 (print), 2329-4957 (electronic)", ISSN-L = "2329-4949", bibdate = "Fri Oct 17 12:28:03 MDT 2014", bibsource = "http://topc.acm.org/; http://www.math.utah.edu/pub/tex/bib/topc.bib", abstract = "Hardware data prefetch engines are integral parts of many general purpose server-class microprocessors in the field today. Some prefetch engines allow users to change some of their parameters. But, the prefetcher is usually enabled in a default configuration during system bring-up, and dynamic reconfiguration of the prefetch engine is not an autonomic feature of current machines. Conceptually, however, it is easy to infer that commonly used prefetch algorithms---when applied in a fixed mode---will not help performance in many cases. In fact, they may actually degrade performance due to useless bus bandwidth consumption and cache pollution, which in turn, will also waste power. We present an adaptive prefetch scheme that dynamically modifies the prefetch settings in order to adapt to workloads' requirements. We use a commercial processor, namely the IBM POWER7 as a vehicle for our study. First we characterize---in terms of performance and power consumption---the prefetcher in that processor using microbenchmarks and SPEC CPU2006. We then present our adaptive prefetch mechanism showing performance improvements with respect to the default prefetch setting up to 2.7X and 1.3X for single-threaded and multiprogrammed workloads, respectively. Adaptive prefetching is also able to reduce power consumption in some cases. Finally, we also evaluate our mechanism with SPECjbb2005, improving both performance and power consumption.", acknowledgement = ack-nhfb, articleno = "4", fjournal = "ACM Transactions on Parallel Computing", journal-URL = "http://dl.acm.org/citation.cfm?id=2632163", } @Article{Heil:2014:APH, author = "Timothy Heil and Anil Krishna and Nicholas Lindberg and Farnaz Toussi and Steven Vanderwiel", title = "Architecture and Performance of the Hardware Accelerators in {IBM}'s {PowerEN} Processor", journal = j-TOPC, volume = "1", number = "1", pages = "5:1--5:??", month = sep, year = "2014", CODEN = "????", DOI = "https://doi.org/10.1145/2588888", ISSN = "2329-4949 (print), 2329-4957 (electronic)", ISSN-L = "2329-4949", bibdate = "Fri Oct 17 12:28:03 MDT 2014", bibsource = "http://topc.acm.org/; http://www.math.utah.edu/pub/tex/bib/topc.bib", abstract = "Computation at the edge of a datacenter has unique characteristics. It deals with streaming data from multiple sources, going to multiple destinations, often requiring repeated application of one or more of several standard algorithmic kernels. These kernels, related to encryption, compression, XML Parsing and regular expression searching on the data, demand a high data processing rate and power efficiency. This suggests the use of hardware acceleration for key functions. However, robust general purpose processing support is necessary to orchestrate the flow of data between accelerators, as well as perform tasks that are not suited to acceleration. Further, these accelerators must be tightly integrated with the general purpose computation in order to keep invocation overhead and latency low. The accelerators must be easy for software to use, and the system must be flexible enough to support evolving networking standards. In this article, we describe and evaluate the architecture of IBM's PowerEN processor, with a focus on PowerEN's architectural enhancements and its on-chip hardware accelerators. PowerEN unites the throughput of application-specific accelerators with the programmability of general purpose cores on a single coherent memory architecture. Hardware acceleration improves throughput by orders of magnitude in some cases compared to equivalent computation on the general purpose cores. By offloading work to the accelerators, general purpose cores are freed to simultaneously work on computation less suited to acceleration.", acknowledgement = ack-nhfb, articleno = "5", fjournal = "ACM Transactions on Parallel Computing", journal-URL = "http://dl.acm.org/citation.cfm?id=2632163", } @Article{Wu:2014:MAG, author = "Xing Wu and Frank Mueller and Scott Pakin", title = "A methodology for automatic generation of executable communication specifications from parallel {MPI} applications", journal = j-TOPC, volume = "1", number = "1", pages = "6:1--6:??", month = sep, year = "2014", CODEN = "????", DOI = "https://doi.org/10.1145/2660249", ISSN = "2329-4949 (print), 2329-4957 (electronic)", ISSN-L = "2329-4949", bibdate = "Fri Oct 17 12:28:03 MDT 2014", bibsource = "http://topc.acm.org/; http://www.math.utah.edu/pub/tex/bib/topc.bib", abstract = "Portable parallel benchmarks are widely used for performance evaluation of HPC systems. However, because these are manually produced, they generally represent a greatly simplified view of application behavior, missing the subtle but important-to-performance nuances that may exist in a complete application. This work contributes novel methods to automatically generate highly portable and customizable communication benchmarks from HPC applications. We utilize ScalaTrace, a lossless yet scalable parallel-application tracing framework to collect selected aspects of the run-time behavior of HPC applications, including communication operations and computation time, while abstracting away the details of the computation proper. We subsequently generate benchmarks with nearly identical run-time behavior to the original applications. Results demonstrate that the generated benchmarks are in fact able to preserve the run-time behavior (including both the communication pattern and the execution time) of the original applications. Such automated benchmark generation is without precedent and particularly valuable for proprietary, export-controlled, or classified application codes.", acknowledgement = ack-nhfb, articleno = "6", fjournal = "ACM Transactions on Parallel Computing", journal-URL = "http://dl.acm.org/citation.cfm?id=2632163", } @Article{Ravishankar:2014:APC, author = "Mahesh Ravishankar and John Eisenlohr and Louis-No{\"e}l Pouchet and J. Ramanujam and Atanas Rountev and P. Sadayappan", title = "Automatic parallelization of a class of irregular loops for distributed memory systems", journal = j-TOPC, volume = "1", number = "1", pages = "7:1--7:??", month = sep, year = "2014", CODEN = "????", DOI = "https://doi.org/10.1145/2660251", ISSN = "2329-4949 (print), 2329-4957 (electronic)", ISSN-L = "2329-4949", bibdate = "Fri Oct 17 12:28:03 MDT 2014", bibsource = "http://topc.acm.org/; http://www.math.utah.edu/pub/tex/bib/topc.bib", abstract = "Many scientific applications spend significant time within loops that are parallel, except for dependences from associative reduction operations. However these loops often contain data-dependent control-flow and array-access patterns. Traditional optimizations that rely on purely static analysis fail to generate parallel code in such cases. This article proposes an approach for automatic parallelization for distributed memory environments, using both static and runtime analysis. We formalize the computations that are targeted by this approach and develop algorithms to detect such computations. We also describe algorithms to generate a parallel inspector that performs a runtime analysis of control-flow and array-access patterns, and a parallel executor to take advantage of this information. The effectiveness of the approach is demonstrated on several benchmarks that were automatically transformed using a prototype compiler. For these, the inspector overheads and performance of the executor code were measured. The benefit on real-world applications was also demonstrated through similar manual transformations of an atmospheric modeling software.", acknowledgement = ack-nhfb, articleno = "7", fjournal = "ACM Transactions on Parallel Computing", journal-URL = "http://dl.acm.org/citation.cfm?id=2632163", } @Article{Shun:2014:SPC, author = "Julian Shun and Guy E. Blelloch", title = "A simple parallel {Cartesian} tree algorithm and its application to parallel suffix tree construction", journal = j-TOPC, volume = "1", number = "1", pages = "8:1--8:??", month = sep, year = "2014", CODEN = "????", DOI = "https://doi.org/10.1145/2661653", ISSN = "2329-4949 (print), 2329-4957 (electronic)", ISSN-L = "2329-4949", bibdate = "Fri Oct 17 12:28:03 MDT 2014", bibsource = "http://topc.acm.org/; http://www.math.utah.edu/pub/tex/bib/topc.bib", abstract = "We present a simple linear work and space, and polylogarithmic time parallel algorithm for generating multiway Cartesian trees. We show that bottom-up traversals of the multiway Cartesian tree on the interleaved suffix array and longest common prefix array of a string can be used to answer certain string queries. By adding downward pointers in the tree (e.g. using a hash table), we can also generate suffix trees from suffix arrays on arbitrary alphabets in the same bounds. In conjunction with parallel suffix array algorithms, such as the skew algorithm, this gives a rather simple linear work parallel, $O(n \epsilon) $ time $ (0 < \epsilon < 1) $, algorithm for generating suffix trees over an integer alphabet $ \Sigma \{ 1, \ldots {}, n \} $, where $n$ is the length of the input string. It also gives a linear work parallel algorithm requiring $O(\log_2 n) $ time with high probability for constant-sized alphabets. More generally, given a sorted sequence of strings and the longest common prefix lengths between adjacent elements, the algorithm will generate a patricia tree (compacted trie) over the strings. Of independent interest, we describe a work-efficient parallel algorithm for solving the all nearest smaller values problem using Cartesian trees, which is much simpler than the work-efficient parallel algorithm described in previous work. We also present experimental results comparing the performance of the algorithm to existing sequential implementations and a second parallel algorithm that we implement. We present comparisons for the Cartesian tree algorithm on its own and for constructing a suffix tree. The results show that on a variety of strings our algorithm is competitive with the sequential version on a single processor and achieves good speedup on multiple processors. We present experiments for three applications that require only the Cartesian tree, and also for searching using the suffix tree.", acknowledgement = ack-nhfb, articleno = "8", fjournal = "ACM Transactions on Parallel Computing", journal-URL = "http://dl.acm.org/citation.cfm?id=2632163", } @Article{Pingali:2015:ISI, author = "Keshav Pingali and J. Ramanujam and P. Sadayappan", title = "Introduction to the Special Issue on {PPoPP'12}", journal = j-TOPC, volume = "1", number = "2", pages = "9:1--9:??", month = jan, year = "2015", CODEN = "????", DOI = "https://doi.org/10.1145/2716343", ISSN = "2329-4949 (print), 2329-4957 (electronic)", ISSN-L = "2329-4949", bibdate = "Wed Feb 18 16:46:00 MST 2015", bibsource = "http://topc.acm.org/; http://www.math.utah.edu/pub/tex/bib/topc.bib", acknowledgement = ack-nhfb, articleno = "9", fjournal = "ACM Transactions on Parallel Computing", journal-URL = "http://dl.acm.org/citation.cfm?id=2632163", } @Article{Bouteiller:2015:ABF, author = "Aurelien Bouteiller and Thomas Herault and George Bosilca and Peng Du and Jack Dongarra", title = "Algorithm-Based Fault Tolerance for Dense Matrix Factorizations, Multiple Failures and Accuracy", journal = j-TOPC, volume = "1", number = "2", pages = "10:1--10:??", month = jan, year = "2015", CODEN = "????", DOI = "https://doi.org/10.1145/2686892", ISSN = "2329-4949 (print), 2329-4957 (electronic)", ISSN-L = "2329-4949", bibdate = "Wed Feb 18 16:46:00 MST 2015", bibsource = "http://topc.acm.org/; http://www.math.utah.edu/pub/bibnet/authors/d/dongarra-jack-j.bib; http://www.math.utah.edu/pub/tex/bib/topc.bib", abstract = "Dense matrix factorizations, such as LU, Cholesky and QR, are widely used for scientific applications that require solving systems of linear equations, eigenvalues and linear least squares problems. Such computations are normally carried out on supercomputers, whose ever-growing scale induces a fast decline of the Mean Time To Failure (MTTF). This article proposes a new hybrid approach, based on Algorithm-Based Fault Tolerance (ABFT), to help matrix factorizations algorithms survive fail-stop failures. We consider extreme conditions, such as the absence of any reliable node and the possibility of losing both data and checksum from a single failure. We will present a generic solution for protecting the right factor, where the updates are applied, of all above mentioned factorizations. For the left factor, where the panel has been applied, we propose a scalable checkpointing algorithm. This algorithm features high degree of checkpointing parallelism and cooperatively utilizes the checksum storage leftover from the right factor protection. The fault-tolerant algorithms derived from this hybrid solution is applicable to a wide range of dense matrix factorizations, with minor modifications. Theoretical analysis shows that the fault tolerance overhead decreases inversely to the scaling in the number of computing units and the problem size. Experimental results of LU and QR factorization on the Kraken (Cray XT5) supercomputer validate the theoretical evaluation and confirm negligible overhead, with- and without-errors. Applicability to tolerate multiple failures and accuracy after multiple recovery is also considered.", acknowledgement = ack-nhfb, articleno = "10", fjournal = "ACM Transactions on Parallel Computing", journal-URL = "http://dl.acm.org/citation.cfm?id=2632163", } @Article{Ballard:2015:ACS, author = "Grey Ballard and James Demmel and Nicholas Knight", title = "Avoiding Communication in Successive Band Reduction", journal = j-TOPC, volume = "1", number = "2", pages = "11:1--11:??", month = jan, year = "2015", CODEN = "????", DOI = "https://doi.org/10.1145/2686877", ISSN = "2329-4949 (print), 2329-4957 (electronic)", ISSN-L = "2329-4949", bibdate = "Wed Feb 18 16:46:00 MST 2015", bibsource = "http://topc.acm.org/; http://www.math.utah.edu/pub/tex/bib/topc.bib", abstract = "The running time of an algorithm depends on both arithmetic and communication (i.e., data movement) costs, and the relative costs of communication are growing over time. In this work, we present sequential and distributed-memory parallel algorithms for tridiagonalizing full symmetric and symmetric band matrices that asymptotically reduce communication compared to previous approaches. The tridiagonalization of a symmetric band matrix is a key kernel in solving the symmetric eigenvalue problem for both full and band matrices. In order to preserve structure, tridiagonalization routines use annihilate-and-chase procedures that previously have suffered from poor data locality and high parallel latency cost. We improve both by reorganizing the computation and obtain asymptotic improvements. We also propose new algorithms for reducing a full symmetric matrix to band form in a communication-efficient manner. In this article, we consider the cases of computing eigenvalues only and of computing eigenvalues and all eigenvectors.", acknowledgement = ack-nhfb, articleno = "11", fjournal = "ACM Transactions on Parallel Computing", journal-URL = "http://dl.acm.org/citation.cfm?id=2632163", } @Article{Sack:2015:CAM, author = "Paul Sack and William Gropp", title = "Collective Algorithms for Multiported Torus Networks", journal = j-TOPC, volume = "1", number = "2", pages = "12:1--12:??", month = jan, year = "2015", CODEN = "????", DOI = "https://doi.org/10.1145/2686882", ISSN = "2329-4949 (print), 2329-4957 (electronic)", ISSN-L = "2329-4949", bibdate = "Wed Feb 18 16:46:00 MST 2015", bibsource = "http://topc.acm.org/; http://www.math.utah.edu/pub/tex/bib/topc.bib", abstract = "Modern supercomputers with torus networks allow each node to simultaneously pass messages on all of its links. However, most collective algorithms are designed to only use one link at a time. In this work, we present novel multiported algorithms for the scatter, gather, all-gather, and reduce-scatter operations. Our algorithms can be combined to create multiported reduce, all-reduce, and broadcast algorithms. Several of these algorithms involve a new technique where we relax the MPI message-ordering constraints to achieve high performance and restore the correct ordering using an additional stage of redundant communication. According to our models, on an $n$-dimensional torus, our algorithms should allow for nearly a $2 n$-fold improvement in communication performance compared to known, single-ported torus algorithms. In practice, we have achieved nearly $6\times$ better performance on a 32k-node 3-dimensional torus.", acknowledgement = ack-nhfb, articleno = "12", fjournal = "ACM Transactions on Parallel Computing", journal-URL = "http://dl.acm.org/citation.cfm?id=2632163", } @Article{Dice:2015:LCG, author = "David Dice and Virendra J. Marathe and Nir Shavit", title = "Lock Cohorting: a General Technique for Designing {NUMA} Locks", journal = j-TOPC, volume = "1", number = "2", pages = "13:1--13:??", month = jan, year = "2015", CODEN = "????", DOI = "https://doi.org/10.1145/2686884", ISSN = "2329-4949 (print), 2329-4957 (electronic)", ISSN-L = "2329-4949", bibdate = "Wed Feb 18 16:46:00 MST 2015", bibsource = "http://topc.acm.org/; http://www.math.utah.edu/pub/tex/bib/topc.bib", abstract = "Multicore machines are quickly shifting to NUMA and CC-NUMA architectures, making scalable NUMA-aware locking algorithms, ones that take into account the machine's nonuniform memory and caching hierarchy, ever more important. This article presents lock cohorting, a general new technique for designing NUMA-aware locks that is as simple as it is powerful. Lock cohorting allows one to transform any spin-lock algorithm, with minimal nonintrusive changes,into a scalable NUMA-aware spin-lock. Our new cohorting technique allows us to easily create NUMA-aware versions of the TATAS-Backoff, CLH, MCS, and ticket locks, to name a few. Moreover, it allows us to derive a CLH-based cohort abortable lock, the first NUMA-aware queue lock to support abortability. We empirically compared the performance of cohort locks with prior NUMA-aware and classic NUMA-oblivious locks on a synthetic micro-benchmark, a real world key-value store application memcached, as well as the libc memory allocator. Our results demonstrate that cohort locks perform as well or better than known locks when the load is low and significantly out-perform them as the load increases.", acknowledgement = ack-nhfb, articleno = "13", fjournal = "ACM Transactions on Parallel Computing", journal-URL = "http://dl.acm.org/citation.cfm?id=2632163", } @Article{Merrill:2015:HPS, author = "Duane Merrill and Michael Garland and Andrew Grimshaw", title = "High-Performance and Scalable {GPU} Graph Traversal", journal = j-TOPC, volume = "1", number = "2", pages = "14:1--14:??", month = jan, year = "2015", CODEN = "????", DOI = "https://doi.org/10.1145/2717511", ISSN = "2329-4949 (print), 2329-4957 (electronic)", ISSN-L = "2329-4949", bibdate = "Wed Feb 18 16:46:00 MST 2015", bibsource = "http://topc.acm.org/; http://www.math.utah.edu/pub/tex/bib/topc.bib", abstract = "Breadth-First Search (BFS) is a core primitive for graph traversal and a basis for many higher-level graph analysis algorithms. It is also representative of a class of parallel computations whose memory accesses and work distribution are both irregular and data dependent. Recent work has demonstrated the plausibility of GPU sparse graph traversal, but has tended to focus on asymptotically inefficient algorithms that perform poorly on graphs with nontrivial diameter. We present a BFS parallelization focused on fine-grained task management constructed from efficient prefix sum computations that achieves an asymptotically optimal O(|V| + |E|) gd work complexity. Our implementation delivers excellent performance on diverse graphs, achieving traversal rates in excess of 3.3 billion and 8.3 billion traversed edges per second using single- and quad-GPU configurations, respectively. This level of performance is several times faster than state-of-the-art implementations on both CPU and GPU platforms.", acknowledgement = ack-nhfb, articleno = "14", fjournal = "ACM Transactions on Parallel Computing", journal-URL = "http://dl.acm.org/citation.cfm?id=2632163", } @Article{Kramer:2015:SET, author = "Stephan C. Kramer and Johannes Hagemann", title = "{SciPAL}: Expression Templates and Composition Closure Objects for High Performance Computational Physics with {CUDA} and {OpenMP}", journal = j-TOPC, volume = "1", number = "2", pages = "15:1--15:??", month = jan, year = "2015", CODEN = "????", DOI = "https://doi.org/10.1145/2686886", ISSN = "2329-4949 (print), 2329-4957 (electronic)", ISSN-L = "2329-4949", bibdate = "Wed Feb 18 16:46:00 MST 2015", bibsource = "http://topc.acm.org/; http://www.math.utah.edu/pub/tex/bib/topc.bib", abstract = "We present SciPAL (scientific parallel algorithms library), a C ++-based, hardware-independent open-source library. Its core is a domain-specific embedded language for numerical linear algebra. The main fields of application are finite element simulations, coherent optics and the solution of inverse problems. Using SciPAL algorithms can be stated in a mathematically intuitive way in terms of matrix and vector operations. Existing algorithms can easily be adapted to GPU-based computing by proper template specialization. Our library is compatible with the finite element library deal .II and provides a port of deal.II's most frequently used linear algebra classes to CUDA (NVidia's extension of the programming languages C and C ++ for programming their GPUs). SciPAL 's operator-based API for BLAS operations particularly aims at simplifying the usage of NVidia's CUBLAS. For non-BLAS array arithmetic SciPAL 's expression templates are able to generate CUDA kernels at compile time. We demonstrate the benefits of SciPAL using the iterative principal component analysis as example which is the core algorithm for the spike-sorting problem in neuroscience.", acknowledgement = ack-nhfb, articleno = "15", fjournal = "ACM Transactions on Parallel Computing", journal-URL = "http://dl.acm.org/citation.cfm?id=2632163", } @Article{Totoni:2015:PME, author = "Ehsan Totoni and Nikhil Jain and Laxmikant V. Kale", title = "Power Management of Extreme-Scale Networks with On\slash Off Links in Runtime Systems", journal = j-TOPC, volume = "1", number = "2", pages = "16:1--16:??", month = jan, year = "2015", CODEN = "????", DOI = "https://doi.org/10.1145/2687001", ISSN = "2329-4949 (print), 2329-4957 (electronic)", ISSN-L = "2329-4949", bibdate = "Wed Feb 18 16:46:00 MST 2015", bibsource = "http://topc.acm.org/; http://www.math.utah.edu/pub/tex/bib/topc.bib", abstract = "Networks are among major power consumers in large-scale parallel systems. During execution of common parallel applications, a sizeable fraction of the links in the high-radix interconnects are either never used or are underutilized. We propose a runtime system based adaptive approach to turn off unused links, which has various advantages over the previously proposed hardware and compiler based approaches. We discuss why the runtime system is the best system component to accomplish this task, and test the effectiveness of our approach using real applications (including NAMD, MILC), and application benchmarks (including NAS Parallel Benchmarks, Stencil). These codes are simulated on representative topologies such as 6-D Torus and multilevel directly connected network (similar to IBM PERCS in Power 775 and Dragonfly in Cray Aries). For common applications with near-neighbor communication pattern, our approach can save up to 20\% of total machine's power and energy, without any performance penalty.", acknowledgement = ack-nhfb, articleno = "16", fjournal = "ACM Transactions on Parallel Computing", journal-URL = "http://dl.acm.org/citation.cfm?id=2632163", } @Article{Herlihy:2015:GEI, author = "Maurice Herlihy", title = "{Guest Editor} Introduction", journal = j-TOPC, volume = "2", number = "1", pages = "1:1--1:??", month = may, year = "2015", CODEN = "????", DOI = "https://doi.org/10.1145/2716306", ISSN = "2329-4949 (print), 2329-4957 (electronic)", ISSN-L = "2329-4949", bibdate = "Thu May 21 16:27:00 MDT 2015", bibsource = "http://topc.acm.org/; http://www.math.utah.edu/pub/tex/bib/topc.bib", acknowledgement = ack-nhfb, articleno = "1", fjournal = "ACM Transactions on Parallel Computing", journal-URL = "http://dl.acm.org/citation.cfm?id=2632163", } @Article{Degener:2015:LCS, author = "Bastian Degener and Barbara Kempkes and Peter Kling and Friedhelm {Meyer Auf Der Heide}", title = "Linear and Competitive Strategies for Continuous Robot Formation Problems", journal = j-TOPC, volume = "2", number = "1", pages = "2:1--2:??", month = may, year = "2015", CODEN = "????", DOI = "https://doi.org/10.1145/2742341", ISSN = "2329-4949 (print), 2329-4957 (electronic)", ISSN-L = "2329-4949", bibdate = "Thu May 21 16:27:00 MDT 2015", bibsource = "http://topc.acm.org/; http://www.math.utah.edu/pub/tex/bib/topc.bib", abstract = "We study a scenario in which n mobile robots with a limited viewing range are distributed in the Euclidean plane and have to solve a formation problem. The formation problems we consider are the Gathering problem and the Chain-Formation problem. In the Gathering problem, the robots have to gather in one (not predefined) point, while in the Chain-Formation problem they have to form a connected communication chain of minimal length between two stationary base stations. Each robot may base its decisions where to move only on the current relative positions of neighboring robots (that are within its viewing range); that is, besides having a limited viewing range, the robots are oblivious (they do not use information from the past), have none or only very limited identities, and they do not have a common sense of direction. Variants of these problems (especially for the Gathering problem) have been studied extensively in different discrete time models. In contrast, our work focuses on a continuous time model; that is, the robots continuously sense the positions of other robots within their viewing range and continuously adapt their speed and direction according to some simple, local rules. Hereby, we assume that the robots have a maximum movement speed of one. We show that this idealized idea of continuous sensing allows us to solve the mentioned formation problems in linear time $ O(n) $ (which, given the maximum speed of one, immediately yields a maximum traveled distance of $ O(n)$). Note that in the more classical discrete time models, the best known strategies need at least $ \Theta (n^2)$ or even $ \Theta (n^2 \log n)$ timesteps to solve these problems. For the Gathering problem, our analysis solves a problem left open by Gordon et al. [2004], where the authors could prove that gathering in a continuous model is possible in finite time, but were not able to give runtime bounds. Apart from these linear bounds, we also provide runtime bounds for both formation problems that relate the runtime of our strategies to the runtime of an optimal, global algorithm. Specifically, we show that our strategy for the Gathering problem is log OPT-competitive and the strategy for the Chain-Formation problem is $ \log n$ competitive. Here, by $c$-competitive, we mean that our (local) strategy is asymptotically by at most a factor of $c$ slower than an optimal, global strategy.", acknowledgement = ack-nhfb, articleno = "2", fjournal = "ACM Transactions on Parallel Computing", journal-URL = "http://dl.acm.org/citation.cfm?id=2632163", } @Article{Jain:2015:NOS, author = "Navendu Jain and Ishai Menache and Joseph (Seffi) Naor and Jonathan Yaniv", title = "Near-Optimal Scheduling Mechanisms for Deadline-Sensitive Jobs in Large Computing Clusters", journal = j-TOPC, volume = "2", number = "1", pages = "3:1--3:??", month = may, year = "2015", CODEN = "????", DOI = "https://doi.org/10.1145/2742343", ISSN = "2329-4949 (print), 2329-4957 (electronic)", ISSN-L = "2329-4949", bibdate = "Thu May 21 16:27:00 MDT 2015", bibsource = "http://topc.acm.org/; http://www.math.utah.edu/pub/tex/bib/topc.bib", abstract = "We consider a market-based resource allocation model for batch jobs in cloud computing clusters. In our model, we incorporate the importance of the due date of a job rather than the number of servers allocated to it at any given time. Each batch job is characterized by the work volume of total computing units (e.g., CPU hours) along with a bound on maximum degree of parallelism. Users specify, along with these job characteristics, their desired due date and a value for finishing the job by its deadline. Given this specification, the primary goal is to determine the scheduling of cloud computing instances under capacity constraints in order to maximize the social welfare (i.e., sum of values gained by allocated users). Our main result is a new (CC-kcss-1)-approximation algorithm for this objective, where $C$ denotes cloud capacity, $k$ is the maximal bound on parallelized execution (in practical settings, $ k < C$) and $s$ is the slackness on the job completion time, that is, the minimal ratio between a specified deadline and the earliest finish time of a job. Our algorithm is based on utilizing dual fitting arguments over a strengthened linear program to the problem. Based on the new approximation algorithm, we construct truthful allocation and pricing mechanisms, in which reporting the true value and other properties of the job (deadline, work volume, and the parallelism bound) is a dominant strategy for all users. To that end, we extend known results for single-value settings to provide a general framework for transforming allocation algorithms into truthful mechanisms in domains of single-value and multi-properties. We then show that the basic mechanism can be extended under proper Bayesian assumptions to the objective of maximizing revenues, which is important for public clouds. We empirically evaluate the benefits of our approach through simulations on data-center job traces, and show that the revenues obtained under our mechanism are comparable with an ideal fixed-price mechanism, which sets an on-demand price using oracle knowledge of users' valuations. Finally, we discuss how our model can be extended to accommodate uncertainties in job work volumes, which is a practical challenge in cloud settings.", acknowledgement = ack-nhfb, articleno = "3", fjournal = "ACM Transactions on Parallel Computing", journal-URL = "http://dl.acm.org/citation.cfm?id=2632163", } @Article{Feldman:2015:HCG, author = "Moran Feldman and Liane Lewin-Eytan and Joseph (Seffi) Naor", title = "Hedonic Clustering Games", journal = j-TOPC, volume = "2", number = "1", pages = "4:1--4:??", month = may, year = "2015", CODEN = "????", DOI = "https://doi.org/10.1145/2742345", ISSN = "2329-4949 (print), 2329-4957 (electronic)", ISSN-L = "2329-4949", bibdate = "Thu May 21 16:27:00 MDT 2015", bibsource = "http://topc.acm.org/; http://www.math.utah.edu/pub/tex/bib/topc.bib", abstract = "Clustering, the partitioning of objects with respect to a similarity measure, has been extensively studied as a global optimization problem. We investigate clustering from a game-theoretic approach, and consider the class of hedonic clustering games. Here, a self-organized clustering is obtained via decisions made by independent players, corresponding to the elements clustered. Being a hedonic setting, the utility of each player is determined by the identity of the other members of her cluster. This class of games seems to be quite robust, as it fits with rather different, yet commonly used, clustering criteria. Specifically, we investigate hedonic clustering games in two different models: fixed clustering, which subdivides into $k$-median and $k$-center, and correlation clustering. We provide a thorough analysis of these games, characterizing Nash equilibria, and proving upper and lower bounds on the price of anarchy and price of stability. For fixed clustering we focus on the existence of a Nash equilibrium, as it is a rather nontrivial issue in this setting. We study it both for general metrics and special cases, such as line and tree metrics. In the correlation clustering model, we study both minimization and maximization variants, and provide almost tight bounds on both the price of anarchy and price of stability.", acknowledgement = ack-nhfb, articleno = "4", fjournal = "ACM Transactions on Parallel Computing", journal-URL = "http://dl.acm.org/citation.cfm?id=2632163", } @Article{Jahn:2015:RRA, author = "Janmartin Jahn and Santiago Pagani and Sebastian Kobbe and Jian-Jia Chen and J{\"o}rg Henkel", title = "Runtime Resource Allocation for Software Pipelines", journal = j-TOPC, volume = "2", number = "1", pages = "5:1--5:??", month = may, year = "2015", CODEN = "????", DOI = "https://doi.org/10.1145/2742347", ISSN = "2329-4949 (print), 2329-4957 (electronic)", ISSN-L = "2329-4949", bibdate = "Thu May 21 16:27:00 MDT 2015", bibsource = "http://topc.acm.org/; http://www.math.utah.edu/pub/tex/bib/topc.bib", abstract = "Efficiently allocating the computational resources of many-core systems is one of the most prominent challenges, especially when resource requirements may vary unpredictably at runtime. This is even more challenging when facing unreliable cores --- a scenario that becomes common as the number of cores increases and integration sizes shrink. To address this challenge, this article presents an optimal method for the allocation of the resources to software-pipelined applications. Here we show how runtime observations of the resource requirements of tasks can be used to adapt resource allocations. Furthermore, we show how the optimum can be traded for a high degree of scalability by clustering applications in a distributed, hierarchical manner. To diminish the negative effects of unreliable cores, this article shows how self-organization can effectively restore the integrity of such a hierarchy when it is corrupted by a failing core. Experiments on Intel's 48-core Single-Chip Cloud Computer and in a many-core simulator show that a significant improvement in system throughput can be achieved over the current state of the art.", acknowledgement = ack-nhfb, articleno = "5", fjournal = "ACM Transactions on Parallel Computing", journal-URL = "http://dl.acm.org/citation.cfm?id=2632163", } @Article{Xu:2015:SVC, author = "Yi Xu and Bo Zhao and Youtao Zhang and Jun Yang", title = "Simple Virtual Channel Allocation for High-Throughput and High-Frequency On-Chip Routers", journal = j-TOPC, volume = "2", number = "1", pages = "6:1--6:??", month = may, year = "2015", CODEN = "????", DOI = "https://doi.org/10.1145/2742349", ISSN = "2329-4949 (print), 2329-4957 (electronic)", ISSN-L = "2329-4949", bibdate = "Thu May 21 16:27:00 MDT 2015", bibsource = "http://topc.acm.org/; http://www.math.utah.edu/pub/tex/bib/topc.bib", abstract = "Packet-switched network-on-chip (NoC) has provided a scalable solution to the communications for tiled multicore processors. However, the virtual channel (VC) buffers in the NoC consume significant dynamic and leakage power. To improve the energy efficiency of the router design, it is advantageous to use small buffer sizes while still maintaining throughput of the network. This article proposes two new virtual channel allocation (VA) mechanisms, termed fixed VC assignment with dynamic VC allocation (FVADA) and adjustable VC assignment with dynamic VC allocation (AVADA). VCs are designated to output ports and allocated to packets according to such assignment. This can help to reduce the head-of-line blocking. Such VC-output port assignment can also be adjusted dynamically to accommodate traffic changes. Simulation results show that both mechanisms can improve network throughput by 41\% on average. Real traffic evaluation shows a network latency reduction of up to 66\%. In addition, AVADA can outperform the baseline in throughput with only half of the buffer size. Finally, we are able to achieve comparable or better throughput than a previous dynamic VC allocator while reducing its critical path delay by 57\%. Hence, the proposed VA mechanisms are suitable for low-power, high-throughput, and high-frequency NoC designs.", acknowledgement = ack-nhfb, articleno = "6", fjournal = "ACM Transactions on Parallel Computing", journal-URL = "http://dl.acm.org/citation.cfm?id=2632163", } @Article{Hammouda:2015:NTE, author = "Adam Hammouda and Andrew R. Siegel and Stephen F. Siegel", title = "Noise-Tolerant Explicit Stencil Computations for Nonuniform Process Execution Rates", journal = j-TOPC, volume = "2", number = "1", pages = "7:1--7:??", month = may, year = "2015", CODEN = "????", DOI = "https://doi.org/10.1145/2742351", ISSN = "2329-4949 (print), 2329-4957 (electronic)", ISSN-L = "2329-4949", bibdate = "Thu May 21 16:27:00 MDT 2015", bibsource = "http://topc.acm.org/; http://www.math.utah.edu/pub/tex/bib/topc.bib", abstract = "Next-generation HPC computing platforms are likely to be characterized by significant, unpredictable nonuniformities in execution time among compute nodes and cores. The resulting load imbalances from this nonuniformity are expected to arise from a variety of sources-manufacturing discrepancies, dynamic power management, runtime component failure, OS jitter, software-mediated resiliency, and TLB/- cache performance variations, for example. It is well understood that existing algorithms with frequent points of bulk synchronization will perform relatively poorly in the presence of these sources of process nonuniformity. Thus, recasting classic bulk synchronous algorithms into more asynchronous, coarse-grained parallelism is a critical area of research for next-generation computing. We propose a class of parallel algorithms for explicit stencil computations that can tolerate these nonuniformities by decoupling per process communication and computation in order for each process to progress asynchronously while maintaining solution correctness. These algorithms are benchmarked with a $1$D domain decomposed (``slabbed'') implementation of the $2$D heat equation as a model problem, and are tested in the presence of simulated nonuniform process execution rates. The resulting performance is compared to a classic bulk synchronous implementation of the model problem. Results show that the runtime of this article's algorithm on a machine with simulated process nonuniformities is 5--99\% slower than the runtime of its classic counterpart on a machine free of nonuniformities. However, when both algorithms are run on a machine with comparable synthetic process nonuniformities, this article's algorithm is 1--37 times faster than its classic counterpart.", acknowledgement = ack-nhfb, articleno = "7", fjournal = "ACM Transactions on Parallel Computing", journal-URL = "http://dl.acm.org/citation.cfm?id=2632163", } @Article{McCreesh:2015:SST, author = "Ciaran McCreesh and Patrick Prosser", title = "The Shape of the Search Tree for the Maximum Clique Problem and the Implications for Parallel Branch and Bound", journal = j-TOPC, volume = "2", number = "1", pages = "8:1--8:??", month = may, year = "2015", CODEN = "????", DOI = "https://doi.org/10.1145/2742359", ISSN = "2329-4949 (print), 2329-4957 (electronic)", ISSN-L = "2329-4949", bibdate = "Thu May 21 16:27:00 MDT 2015", bibsource = "http://topc.acm.org/; http://www.math.utah.edu/pub/tex/bib/topc.bib", abstract = "Finding a maximum clique in a given graph is one of the fundamental NP-hard problems. We compare two multicore thread-parallel adaptations of a state-of-the-art branch-and-bound algorithm for the maximum clique problem and provide a novel explanation as to why they are successful. We show that load balance is sometimes a problem but that the interaction of parallel search order and the most likely location of solutions within the search space is often the dominating consideration. We use this explanation to propose a new low-overhead, scalable work-splitting mechanism. Our approach uses explicit early diversity to avoid strong commitment to the weakest heuristic advice and late resplitting for balance. More generally, we argue that, for branch-and-bound, parallel algorithm design should not be performed independently of the underlying sequential algorithm.", acknowledgement = ack-nhfb, articleno = "8", fjournal = "ACM Transactions on Parallel Computing", journal-URL = "http://dl.acm.org/citation.cfm?id=2632163", } @Article{Hoefler:2015:RMA, author = "Torsten Hoefler and James Dinan and Rajeev Thakur and Brian Barrett and Pavan Balaji and William Gropp and Keith Underwood", title = "Remote Memory Access Programming in {MPI-3}", journal = j-TOPC, volume = "2", number = "2", pages = "9:1--9:??", month = jul, year = "2015", CODEN = "????", DOI = "https://doi.org/10.1145/2780584", ISSN = "2329-4949 (print), 2329-4957 (electronic)", ISSN-L = "2329-4949", bibdate = "Fri Aug 7 10:22:35 MDT 2015", bibsource = "http://topc.acm.org/; http://www.math.utah.edu/pub/tex/bib/pvm.bib; http://www.math.utah.edu/pub/tex/bib/topc.bib", abstract = "The Message Passing Interface (MPI) 3.0 standard, introduced in September 2012, includes a significant update to the one-sided communication interface, also known as remote memory access (RMA). In particular, the interface has been extended to better support popular one-sided and global-address-space parallel programming models to provide better access to hardware performance features and enable new data-access modes. We present the new RMA interface and specify formal axiomatic models for data consistency and access semantics. Such models can help users reason about details of the semantics that are hard to extract from the English prose in the standard. It also fosters the development of tools and compilers, enabling them to automatically analyze, optimize, and debug RMA programs.", acknowledgement = ack-nhfb, articleno = "9", fjournal = "ACM Transactions on Parallel Computing", journal-URL = "http://dl.acm.org/citation.cfm?id=2632163", } @Article{Maldonado:2015:STB, author = "Walther Maldonado and Patrick Marlier and Pascal Felber and Julia Lawall and Gilles Muller and Etienne Rivi{\`e}re", title = "Supporting Time-Based {QoS} Requirements in Software Transactional Memory", journal = j-TOPC, volume = "2", number = "2", pages = "10:1--10:??", month = jul, year = "2015", CODEN = "????", DOI = "https://doi.org/10.1145/2779621", ISSN = "2329-4949 (print), 2329-4957 (electronic)", ISSN-L = "2329-4949", bibdate = "Fri Aug 7 10:22:35 MDT 2015", bibsource = "http://topc.acm.org/; http://www.math.utah.edu/pub/tex/bib/topc.bib", abstract = "Software transactional memory (STM) is an optimistic concurrency control mechanism that simplifies parallel programming. However, there has been little interest in its applicability to reactive applications in which there is a required response time for certain operations. We propose supporting such applications by allowing programmers to associate time with atomic blocks in the form of deadlines and quality-of-service (QoS) requirements. Based on statistics of past executions, we adjust the execution mode of transactions by decreasing the level of optimism as the deadline approaches. In the presence of concurrent deadlines, we propose different conflict resolution policies. Execution mode switching mechanisms allow the meeting of multiple deadlines in a consistent manner, with potential QoS degradations being split fairly among several threads as contention increases, and avoiding starvation. Our implementation consists of extensions to an STM runtime that allow gathering statistics and switching execution modes. We also propose novel contention managers adapted to transactional workloads subject to deadlines. The experimental evaluation shows that our approaches significantly improve the likelihood of a transaction meeting its deadline and QoS requirement, even in cases where progress is hampered by conflicts and other concurrent transactions with deadlines.", acknowledgement = ack-nhfb, articleno = "10", fjournal = "ACM Transactions on Parallel Computing", journal-URL = "http://dl.acm.org/citation.cfm?id=2632163", } @Article{Kestor:2015:TPD, author = "Gokcen Kestor and Osman S. Unsal and Adrian Cristal and Serdar Tasiran", title = "{TRADE}: Precise Dynamic Race Detection for Scalable Transactional Memory Systems", journal = j-TOPC, volume = "2", number = "2", pages = "11:1--11:??", month = jul, year = "2015", CODEN = "????", DOI = "https://doi.org/10.1145/2786021", ISSN = "2329-4949 (print), 2329-4957 (electronic)", ISSN-L = "2329-4949", bibdate = "Fri Aug 7 10:22:35 MDT 2015", bibsource = "http://topc.acm.org/; http://www.math.utah.edu/pub/tex/bib/multithreading.bib; http://www.math.utah.edu/pub/tex/bib/topc.bib", abstract = "As other multithreaded programs, transactional memory (TM) programs are prone to race conditions. Previous work focuses on extending existing definitions of data race for lock-based applications to TM applications, which requires all transactions to be totally ordered ``as if'' serialized by a global lock. This approach poses implementation constraints on the STM that severely limits TM applications' performance. This article shows that forcing total ordering among all running transactions, while sufficient, is not necessary. We introduce an alternative data race definition, relaxed transactional data race, that requires ordering of only conflicting transactions. The advantages of our relaxed definition are twofold: First, unlike the previous definition, this definition can be applied to a wide range of TMs, including those that do not enforce transaction total ordering. Second, within a single execution, it exposes a higher number of data races, which considerably reduces debugging time. Based on this definition, we propose a novel and precise race detection tool for C/C++ TM applications (TRADE), which detects data races by tracking happens-before edges among conflicting transactions. Our experiments reveal that TRADE precisely detects data races for STAMP applications running on modern STMs with overhead comparable to state-of-the-art race detectors for lock-based applications. Our experiments also show that in a single run, TRADE identifies several races not discovered by 10 separate runs of a race detection tool based on the previous data race definition.", acknowledgement = ack-nhfb, articleno = "11", fjournal = "ACM Transactions on Parallel Computing", journal-URL = "http://dl.acm.org/citation.cfm?id=2632163", } @Article{Diegues:2015:TWE, author = "Nuno Diegues and Paolo Romano", title = "{Time-Warp}: Efficient Abort Reduction in Transactional Memory", journal = j-TOPC, volume = "2", number = "2", pages = "12:1--12:??", month = jul, year = "2015", CODEN = "????", DOI = "https://doi.org/10.1145/2775435", ISSN = "2329-4949 (print), 2329-4957 (electronic)", ISSN-L = "2329-4949", bibdate = "Fri Aug 7 10:22:35 MDT 2015", bibsource = "http://topc.acm.org/; http://www.math.utah.edu/pub/tex/bib/topc.bib", abstract = "The multicore revolution that took place one decade ago has turned parallel programming into a major concern for the mainstream software development industry. In this context, Transactional Memory (TM) has emerged as a simpler and attractive alternative to that of lock-based synchronization, whose complexity and error-proneness are widely recognized. The notion of permissiveness in TM translates to only aborting a transaction when it cannot be accepted in any history that guarantees a target correctness criterion. This theoretically powerful property is often neglected by state-of-the-art TMs because it imposes considerable algorithmic costs. Instead, these TMs opt to maximize their implementation's efficiency by aborting transactions under overly conservative conditions. As a result, they risk rejecting a significant number of safe executions. In this article, we seek to identify a sweet spot between permissiveness and efficiency by introducing the Time-Warp Multiversion (TWM) algorithm. TWM is based on the key idea of allowing an update transaction that has performed stale reads (i.e., missed the writes of concurrently committed transactions) to be serialized by ``committing it in the past,'' which we call a time-warp commit. At its core, TWM uses a novel, lightweight validation mechanism with little computational overhead. TWM also guarantees that read-only transactions can never be aborted. Further, TWM guarantees Virtual World Consistency, a safety property that is deemed as particularly relevant in the context of TM. We demonstrate the practicality of this approach through an extensive experimental study: we compare TWM with five other TMs, representative of typical alternative design choices, and on a wide variety of benchmarks. This study shows an average performance improvement across all considered workloads and TMs of 65\% in high concurrency scenarios, with gains extending up to $9 \times $ with the most favorable benchmarks. These results are a consequence of TWM's ability to achieve drastic reduction of aborts in scenarios of nonminimal contention, while introducing little overhead (approximately 10\%) in worst-case, synthetically designed scenarios (i.e., no contention or contention patterns that cannot be optimized using TWM).", acknowledgement = ack-nhfb, articleno = "12", fjournal = "ACM Transactions on Parallel Computing", journal-URL = "http://dl.acm.org/citation.cfm?id=2632163", } @Article{Eyraud-Dubois:2015:PST, author = "Lionel Eyraud-Dubois and Loris Marchal and Oliver Sinnen and Fr{\'e}d{\'e}ric Vivien", title = "Parallel Scheduling of Task Trees with Limited Memory", journal = j-TOPC, volume = "2", number = "2", pages = "13:1--13:??", month = jul, year = "2015", CODEN = "????", DOI = "https://doi.org/10.1145/2779052", ISSN = "2329-4949 (print), 2329-4957 (electronic)", ISSN-L = "2329-4949", bibdate = "Fri Aug 7 10:22:35 MDT 2015", bibsource = "http://topc.acm.org/; http://www.math.utah.edu/pub/tex/bib/topc.bib", abstract = "This article investigates the execution of tree-shaped task graphs using multiple processors. Each edge of such a tree represents some large data. A task can only be executed if all input and output data fit into memory, and a data can only be removed from memory after the completion of the task that uses it as an input data. Such trees arise in the multifrontal method of sparse matrix factorization. The peak memory needed for the processing of the entire tree depends on the execution order of the tasks. With one processor, the objective of the tree traversal is to minimize the required memory. This problem was well studied, and optimal polynomial algorithms were proposed. Here, we extend the problem by considering multiple processors, which is of obvious interest in the application area of matrix factorization. With multiple processors comes the additional objective to minimize the time needed to traverse the tree-that is, to minimize the makespan. Not surprisingly, this problem proves to be much harder than the sequential one. We study the computational complexity of this problem and provide inapproximability results even for unit weight trees. We design a series of practical heuristics achieving different trade-offs between the minimization of peak memory usage and makespan. Some of these heuristics are able to process a tree while keeping the memory usage under a given memory limit. The different heuristics are evaluated in an extensive experimental evaluation using realistic trees.", acknowledgement = ack-nhfb, articleno = "13", fjournal = "ACM Transactions on Parallel Computing", journal-URL = "http://dl.acm.org/citation.cfm?id=2632163", } @Article{Dinitz:2015:ISI, author = "Michael Dinitz and Torsten Hoefler", title = "Introduction to the Special Issue on {SPAA 2013}", journal = j-TOPC, volume = "2", number = "3", pages = "14:1--14:??", month = oct, year = "2015", CODEN = "????", DOI = "https://doi.org/10.1145/2809923", ISSN = "2329-4949 (print), 2329-4957 (electronic)", ISSN-L = "2329-4949", bibdate = "Tue Nov 3 07:30:42 MST 2015", bibsource = "http://topc.acm.org/; http://www.math.utah.edu/pub/tex/bib/topc.bib", acknowledgement = ack-nhfb, articleno = "14e", fjournal = "ACM Transactions on Parallel Computing", journal-URL = "http://dl.acm.org/citation.cfm?id=2632163", } @Article{Kumar:2015:FGA, author = "Ravi Kumar and Benjamin Moseley and Sergei Vassilvitskii and Andrea Vattani", title = "Fast Greedy Algorithms in {MapReduce} and Streaming", journal = j-TOPC, volume = "2", number = "3", pages = "14:1--14:??", month = oct, year = "2015", CODEN = "????", DOI = "https://doi.org/10.1145/2809814", ISSN = "2329-4949 (print), 2329-4957 (electronic)", ISSN-L = "2329-4949", bibdate = "Tue Nov 3 07:30:42 MST 2015", bibsource = "http://topc.acm.org/; http://www.math.utah.edu/pub/tex/bib/topc.bib", abstract = "Greedy algorithms are practitioners' best friends---they are intuitive, are simple to implement, and often lead to very good solutions. However, implementing greedy algorithms in a distributed setting is challenging since the greedy choice is inherently sequential, and it is not clear how to take advantage of the extra processing power. Our main result is a powerful sampling technique that aids in parallelization of sequential algorithms. Armed with this primitive, we then adapt a broad class of greedy algorithms to the MapReduce paradigm; this class includes maximum cover and submodular maximization subject to p -system constraint problems. Our method yields efficient algorithms that run in a logarithmic number of rounds while obtaining solutions that are arbitrarily close to those produced by the standard sequential greedy algorithm. We begin with algorithms for modular maximization subject to a matroid constraint and then extend this approach to obtain approximation algorithms for submodular maximization subject to knapsack or p -system constraints.", acknowledgement = ack-nhfb, articleno = "14", fjournal = "ACM Transactions on Parallel Computing", journal-URL = "http://dl.acm.org/citation.cfm?id=2632163", } @Article{Sanders:2015:WEM, author = "Peter Sanders and Jochen Speck and Raoul Steffen", title = "Work-Efficient Matrix Inversion in Polylogarithmic Time", journal = j-TOPC, volume = "2", number = "3", pages = "15:1--15:??", month = oct, year = "2015", CODEN = "????", DOI = "https://doi.org/10.1145/2809812", ISSN = "2329-4949 (print), 2329-4957 (electronic)", ISSN-L = "2329-4949", bibdate = "Tue Nov 3 07:30:42 MST 2015", bibsource = "http://topc.acm.org/; http://www.math.utah.edu/pub/tex/bib/topc.bib", abstract = "We present an algorithm for inversion of symmetric positive definite matrices that combines the practical requirement of an optimal number of arithmetic operations and the theoretical goal of a polylogarithmic critical path length. The algorithm reduces inversion to matrix multiplication. It uses Strassen's recursion scheme, but on the critical path it breaks the recursion early, switching to an asymptotically inefficient yet fast use of Newton's method. We also show that the algorithm is numerically stable. Overall, we get a candidate for a massively parallel algorithm that scales to exascale systems even on relatively small inputs.", acknowledgement = ack-nhfb, articleno = "15", fjournal = "ACM Transactions on Parallel Computing", journal-URL = "http://dl.acm.org/citation.cfm?id=2632163", } @Article{Gilbert:2015:SBO, author = "Seth Gilbert and Chaodong Zheng", title = "{SybilCast}: Broadcast on the Open Airwaves", journal = j-TOPC, volume = "2", number = "3", pages = "16:1--16:??", month = oct, year = "2015", CODEN = "????", DOI = "https://doi.org/10.1145/2809810", ISSN = "2329-4949 (print), 2329-4957 (electronic)", ISSN-L = "2329-4949", bibdate = "Tue Nov 3 07:30:42 MST 2015", bibsource = "http://topc.acm.org/; http://www.math.utah.edu/pub/tex/bib/topc.bib", abstract = "Consider a scenario where many wireless users are attempting to download data from a single base station. While most of the users are honest, some users may be malicious and attempt to obtain more than their fair share of the bandwidth. One possible strategy for attacking the system is to simulate multiple fake identities, each of which is given its own equal share of the bandwidth. Such an attack is often referred to as a sybil attack. To counter such behavior, we propose SybilCast, a protocol for multichannel wireless networks that limits the number of fake identities and, in doing so, ensures that each honest user gets at least a constant fraction of his or her fair share of the bandwidth. As a result, each honest user can complete his or her data download in asymptotically optimal time. A key aspect of this protocol is balancing the rate at which new identities are admitted and the maximum number of fake identities that can coexist while keeping the overhead low. Besides sybil attacks, our protocol can also tolerate spoofing and jamming.", acknowledgement = ack-nhfb, articleno = "16", fjournal = "ACM Transactions on Parallel Computing", journal-URL = "http://dl.acm.org/citation.cfm?id=2632163", } @Article{Lee:2015:FPP, author = "I-Ting Angelina Lee and Charles E. Leiserson and Tao B. Schardl and Zhunping Zhang and Jim Sukha", title = "On-the-Fly Pipeline Parallelism", journal = j-TOPC, volume = "2", number = "3", pages = "17:1--17:??", month = oct, year = "2015", CODEN = "????", DOI = "https://doi.org/10.1145/2809808", ISSN = "2329-4949 (print), 2329-4957 (electronic)", ISSN-L = "2329-4949", bibdate = "Tue Nov 3 07:30:42 MST 2015", bibsource = "http://topc.acm.org/; http://www.math.utah.edu/pub/tex/bib/topc.bib", abstract = "Pipeline parallelism organizes a parallel program as a linear sequence of stages. Each stage processes elements of a data stream, passing each processed data element to the next stage, and then taking on a new element before the subsequent stages have necessarily completed their processing. Pipeline parallelism is used especially in streaming applications that perform video, audio, and digital signal processing. Three out of 13 benchmarks in PARSEC, a popular software benchmark suite designed for shared-memory multiprocessors, can be expressed as pipeline parallelism. Whereas most concurrency platforms that support pipeline parallelism use a ``construct-and-run'' approach, this article investigates ``on-the-fly'' pipeline parallelism, where the structure of the pipeline emerges as the program executes rather than being specified a priori. On-the-fly pipeline parallelism allows the number of stages to vary from iteration to iteration and dependencies to be data dependent. We propose simple linguistics for specifying on-the-fly pipeline parallelism and describe a provably efficient scheduling algorithm, the P iper algorithm, which integrates pipeline parallelism into a work-stealing scheduler, allowing pipeline and fork-join parallelism to be arbitrarily nested. The Piper algorithm automatically throttles the parallelism, precluding ``runaway'' pipelines. Given a pipeline computation with $ T_1 $ work and $ T_\infty $ span (critical-path length), Piper executes the computation on $P$ processors in $ T_P \leq T_1 / P + O(T \infty + \lg P)$ expected time. Piper also limits stack space, ensuring that it does not grow unboundedly with running time. We have incorporated on-the-fly pipeline parallelism into a Cilk-based work-stealing runtime system. Our prototype Cilk-P implementation exploits optimizations such as ``lazy enabling'' and ``dependency folding.'' We have ported the three PARSEC benchmarks that exhibit pipeline parallelism to run on Cilk-P. One of these, x264, cannot readily be executed by systems that support only construct-and-run pipeline parallelism. Benchmark results indicate that Cilk-P has low serial overhead and good scalability. On x264, for example, Cilk-P exhibits a speedup of 13.87 over its respective serial counterpart when running on 16 processors.", acknowledgement = ack-nhfb, articleno = "17", fjournal = "ACM Transactions on Parallel Computing", journal-URL = "http://dl.acm.org/citation.cfm?id=2632163", } @Article{Eikel:2015:IRI, author = "Martina Eikel and Christian Scheideler", title = "{IRIS}: a Robust Information System Against Insider {DoS} Attacks", journal = j-TOPC, volume = "2", number = "3", pages = "18:1--18:??", month = oct, year = "2015", CODEN = "????", DOI = "https://doi.org/10.1145/2809806", ISSN = "2329-4949 (print), 2329-4957 (electronic)", ISSN-L = "2329-4949", bibdate = "Tue Nov 3 07:30:42 MST 2015", bibsource = "http://topc.acm.org/; http://www.math.utah.edu/pub/tex/bib/topc.bib", abstract = "In this work, we present the first scalable distributed information system, that is, a system with low storage overhead, that is provably robust against denial-of-service (DoS) attacks by a current insider. We allow a current insider to have complete knowledge about the information system and to have the power to block any \varepsilon-fraction of its servers by a DoS attack, where \varepsilon can be chosen up to a constant. The task of the system is to serve any collection of lookup requests with at most one per nonblocked server in an efficient way despite this attack. Previously, scalable solutions were only known for DoS attacks of past insiders, where a past insider only has complete knowledge about some past time point $ t_0 $ of the information system. Scheideler et al. [Awerbuch and Scheideler 2007; Baumgart et al. 2009] showed that in this case, it is possible to design an information system so that any information that was inserted or last updated after $ t_0 $ is safe against a DoS attack. But their constructions would not work at all for a current insider. The key idea behind our IRIS system is to make extensive use of coding. More precisely, we present two alternative distributed coding strategies with an at most logarithmic storage overhead that can handle up to a constant fraction of blocked servers.", acknowledgement = ack-nhfb, articleno = "18", fjournal = "ACM Transactions on Parallel Computing", journal-URL = "http://dl.acm.org/citation.cfm?id=2632163", } @Article{Kling:2015:PSM, author = "Peter Kling and Peter Pietrzyk", title = "Profitable Scheduling on Multiple Speed-Scalable Processors", journal = j-TOPC, volume = "2", number = "3", pages = "19:1--19:??", month = oct, year = "2015", CODEN = "????", DOI = "https://doi.org/10.1145/2809872", ISSN = "2329-4949 (print), 2329-4957 (electronic)", ISSN-L = "2329-4949", bibdate = "Tue Nov 3 07:30:42 MST 2015", bibsource = "http://topc.acm.org/; http://www.math.utah.edu/pub/tex/bib/topc.bib", abstract = "We present a new online algorithm for profit-oriented scheduling on multiple speed-scalable processors and provide a tight analysis of the algorithm's competitiveness. Our results generalize and improve upon work by Chan et al. [2010], which considers a single speed-scalable processor. Using significantly different techniques, we can not only extend their model to multiprocessors but also prove an enhanced and tight competitive ratio for our algorithm. In our scheduling problem, jobs arrive over time and are preemptable. They have different workloads, values, and deadlines. The scheduler may decide not to finish a job but instead to suffer a loss equaling the job's value. However, to process a job's workload until its deadline the scheduler must invest a certain amount of energy. The cost of a schedule is the sum of lost values and invested energy. In order to finish a job, the scheduler has to determine which processors to use and set their speeds accordingly. A processor's energy consumption is power $ P_\alpha (s) $ integrated over time, where $ P_\alpha (s) = s^\alpha $ is the power consumption when running at speed $s$. Since we consider the online variant of the problem, the scheduler has no knowledge about future jobs. This problem was introduced by Chan et al. [2010] for the case of a single processor. They presented an online algorithm that is $ \alpha^\alpha + 2 e \alpha $-competitive. We provide an online algorithm for the case of multiple processors with an improved competitive ratio of $ \alpha^\alpha $.", acknowledgement = ack-nhfb, articleno = "19", fjournal = "ACM Transactions on Parallel Computing", journal-URL = "http://dl.acm.org/citation.cfm?id=2632163", } @Article{Dutta:2015:CBR, author = "Chinmoy Dutta and Gopal Pandurangan and Rajmohan Rajaraman and Scott Roche", title = "Coalescing-Branching Random Walks on Graphs", journal = j-TOPC, volume = "2", number = "3", pages = "20:1--20:??", month = oct, year = "2015", CODEN = "????", DOI = "https://doi.org/10.1145/2817830", ISSN = "2329-4949 (print), 2329-4957 (electronic)", ISSN-L = "2329-4949", bibdate = "Tue Nov 3 07:30:42 MST 2015", bibsource = "http://topc.acm.org/; http://www.math.utah.edu/pub/tex/bib/topc.bib", abstract = "We study a distributed randomized information propagation mechanism in networks we call the coalescing-branching random walk (cobra walk, for short). A cobra walk is a generalization of the well-studied ``standard'' random walk, and is useful in modeling and understanding the Susceptible-Infected- Susceptible (SIS)-type of epidemic processes in networks. It can also be helpful in performing light-weight information dissemination in resource-constrained networks. A cobra walk is parameterized by a branching factor $k$. The process starts from an arbitrary vertex, which is labeled active for step 1. In each step of a cobra walk, each active vertex chooses $k$ random neighbors to become active for the next step (``branching''). A vertex is active for step $ t + 1$ only if it is chosen by an active vertex in step $t$ (``coalescing''). This results in a stochastic process in the underlying network with properties that are quite different from both the standard random walk (which is equivalent to the cobra walk with branching factor 1) as well as other gossip-based rumor spreading mechanisms. We focus on the cover time of the cobra walk, which is the number of steps for the walk to reach all the vertices, and derive almost-tight bounds for various graph classes. We show an $ O(\log^2 n)$ high probability bound for the cover time of cobra walks on expanders, if either the expansion factor or the branching factor is sufficiently large; we also obtain an $ O(\log n)$ high probability bound for the partial cover time, which is the number of steps needed for the walk to reach at least a constant fraction of the vertices. We also show that the cover time of the cobra walk is, with high probability, $ O(n \log n)$ on any $n$-vertex tree for $ k \geq 2$, $ {\~ O}(n^{1 / d})$ on a $d$-dimensional grid for $ k \geq 2$, and $ O(\log n)$ on the complete graph.", acknowledgement = ack-nhfb, articleno = "20", fjournal = "ACM Transactions on Parallel Computing", journal-URL = "http://dl.acm.org/citation.cfm?id=2632163", } @Article{Larus:2016:ISI, author = "James Larus and Sandhya Dwarkadas and Jos{\'e} Moreira and Andrew Lumsdaine", title = "Introduction to the Special Issue on {PPoPP'14}", journal = j-TOPC, volume = "2", number = "4", pages = "21:1--21:??", month = mar, year = "2016", CODEN = "????", DOI = "https://doi.org/10.1145/2856513", ISSN = "2329-4949 (print), 2329-4957 (electronic)", ISSN-L = "2329-4949", bibdate = "Sat Mar 19 08:11:13 MDT 2016", bibsource = "http://topc.acm.org/; http://www.math.utah.edu/pub/tex/bib/topc.bib", acknowledgement = ack-nhfb, articleno = "21", fjournal = "ACM Transactions on Parallel Computing", journal-URL = "http://dl.acm.org/citation.cfm?id=2632163", remark = "Special Issue on PPoPP'14 conference.", } @Article{Herlihy:2016:WSF, author = "Maurice Herlihy and Zhiyu Liu", title = "Well-Structured Futures and Cache Locality", journal = j-TOPC, volume = "2", number = "4", pages = "22:1--22:??", month = mar, year = "2016", CODEN = "????", DOI = "https://doi.org/10.1145/2858650", ISSN = "2329-4949 (print), 2329-4957 (electronic)", ISSN-L = "2329-4949", bibdate = "Sat Mar 19 08:11:13 MDT 2016", bibsource = "http://topc.acm.org/; http://www.math.utah.edu/pub/tex/bib/topc.bib", abstract = "In fork-join parallelism, a sequential program is split into a directed acyclic graph of tasks linked by directed dependency edges, and the tasks are executed, possibly in parallel, in an order consistent with their dependencies. A popular and effective way to extend fork-join parallelism is to allow threads to create futures. A thread creates a future to hold the results of a computation, which may or may not be executed in parallel. That result is returned when some thread touches that future, blocking if necessary until the result is ready. Recent research has shown that although futures can, of course, enhance parallelism in a structured way, they can have a deleterious effect on cache locality. In the worst case, futures can incur $ \Omega (P T_\infty + t T_\infty) $ deviations, which implies $ \Omega (C P T_\infty + C t T_\infty) $ additional cache misses, where $C$ is the number of cache lines, $P$ is the number of processors, $t$ is the number of touches, and $ T_\infty $ is the computation span. Since cache locality has a large impact on software performance on modern multicores, this result is troubling. In this article, we show that if futures are used in a simple, disciplined way, then the situation is much better: if each future is touched only once, either by the thread that created it or by a later descendant of the thread that created it, then parallel executions with work stealing can incur at most $ O(C P T^2_\infty)$ additional cache misses-a substantial improvement. This structured use of futures is characteristic of many (but not all) parallel applications.", acknowledgement = ack-nhfb, articleno = "22", fjournal = "ACM Transactions on Parallel Computing", journal-URL = "http://dl.acm.org/citation.cfm?id=2632163", remark = "Special Issue on PPoPP'14 conference.", } @Article{Thomson:2016:CTU, author = "Paul Thomson and Alastair F. Donaldson and Adam Betts", title = "Concurrency Testing Using Controlled Schedulers: an Empirical Study", journal = j-TOPC, volume = "2", number = "4", pages = "23:1--23:??", month = mar, year = "2016", CODEN = "????", DOI = "https://doi.org/10.1145/2858651", ISSN = "2329-4949 (print), 2329-4957 (electronic)", ISSN-L = "2329-4949", bibdate = "Sat Mar 19 08:11:13 MDT 2016", bibsource = "http://topc.acm.org/; http://www.math.utah.edu/pub/tex/bib/topc.bib", abstract = "We present an independent empirical study on concurrency testing using controlled schedulers. We have gathered 49 buggy concurrent software benchmarks, drawn from public code bases, which we call SCTBench. We applied a modified version of an existing concurrency testing tool to SCTBench, testing five controlled scheduling techniques: depth-first search, preemption bounding, delay bounding, a controlled random scheduler, and probabilistic concurrency testing (PCT). We attempt to answer several research questions: Which technique performs the best, in terms of bug finding ability? How effective are the two main schedule bounding techniques-preemption bounding and delay bounding-at finding bugs? What challenges are associated with applying concurrency testing techniques to existing code? Can we classify certain benchmarks as trivial or nontrivial? Overall, we found that PCT (with parameter d = 3) was the most effective technique in terms of bug finding; it found all bugs found by the other techniques, plus an additional three, and it missed only one bug. Surprisingly, we found that the naive controlled random scheduler, which randomly chooses one thread to execute at each scheduling point, performs well, finding more bugs than preemption bounding and just two fewer bugs than delay bounding. Our findings confirm that delay bounding is superior to preemption bounding and that schedule bounding is superior to an unbounded depth-first search. The majority of bugs in SCTBench can be exposed using a small schedule bound (1--2), supporting previous claims, although one benchmark requires five preemptions. We found that the need to remove nondeterminism and control all synchronization (as is required for systematic concurrency testing) can be nontrivial. There were eight distinct programs that could not easily be included in out study, such as those that perform network and interprocess communication. We report various properties about the benchmarks tested, such as the fact that the bugs in 18 benchmarks were exposed 50\% of the time when using random scheduling. We note that future work should not use the benchmarks that we classify as trivial when presenting new techniques, other than as a minimum baseline. We have made SCTBench and our tools publicly available for reproducibility and use in future work.", acknowledgement = ack-nhfb, articleno = "23", fjournal = "ACM Transactions on Parallel Computing", journal-URL = "http://dl.acm.org/citation.cfm?id=2632163", remark = "Special Issue on PPoPP'14 conference.", } @Article{Petrovic:2016:LHM, author = "Darko Petrovi{\'c} and Thomas Ropars and Andr{\'e} Schiper", title = "Leveraging Hardware Message Passing for Efficient Thread Synchronization", journal = j-TOPC, volume = "2", number = "4", pages = "24:1--24:??", month = mar, year = "2016", CODEN = "????", DOI = "https://doi.org/10.1145/2858652", ISSN = "2329-4949 (print), 2329-4957 (electronic)", ISSN-L = "2329-4949", bibdate = "Sat Mar 19 08:11:13 MDT 2016", bibsource = "http://topc.acm.org/; http://www.math.utah.edu/pub/tex/bib/topc.bib", abstract = "As the level of parallelism in manycore processors keeps increasing, providing efficient mechanisms for thread synchronization in concurrent programs is becoming a major concern. On cache-coherent shared-memory processors, synchronization efficiency is ultimately limited by the performance of the underlying cache coherence protocol. This article studies how hardware support for message passing can improve synchronization performance. Considering the ubiquitous problem of mutual exclusion, we devise novel algorithms for (i) classic locking, where application threads obtain exclusive access to a shared resource prior to executing their critical sections (CSes), and (ii) delegation, where CSes are executed by special threads. For classic locking, our HybLock algorithm uses a mix of shared memory and hardware message passing, which introduces the idea of hybrid synchronization algorithms. For delegation, we propose mp-server and HybComb: the former is a straightforward adaptation of the server approach to hardware message passing, whereas the latter is a novel hybrid combining algorithm. Evaluation on Tilera's TILE-Gx processor shows that HybLock outperforms the best known classic locks. Furthermore, mp-server can execute contended CSes with unprecedented throughput, as stalls related to cache coherence are removed from the critical path. HybComb can achieve comparable performance while avoiding the need to dedicate server cores. Consequently, our queue and stack implementations, based on the new synchronization algorithms, largely outperform their most efficient shared-memory-only counterparts.", acknowledgement = ack-nhfb, articleno = "24", fjournal = "ACM Transactions on Parallel Computing", journal-URL = "http://dl.acm.org/citation.cfm?id=2632163", remark = "Special Issue on PPoPP'14 conference.", } @Article{Tardieu:2016:XAP, author = "Olivier Tardieu and Benjamin Herta and David Cunningham and David Grove and Prabhanjan Kambadur and Vijay Saraswat and Avraham Shinnar and Mikio Takeuchi and Mandana Vaziri and Wei Zhang", title = "{X10} and {APGAS} at Petascale", journal = j-TOPC, volume = "2", number = "4", pages = "25:1--25:??", month = mar, year = "2016", CODEN = "????", DOI = "https://doi.org/10.1145/2894746", ISSN = "2329-4949 (print), 2329-4957 (electronic)", ISSN-L = "2329-4949", bibdate = "Sat Mar 19 08:11:13 MDT 2016", bibsource = "http://topc.acm.org/; http://www.math.utah.edu/pub/tex/bib/topc.bib", abstract = "X10 is a high-performance, high-productivity programming language aimed at large-scale distributed and shared-memory parallel applications. It is based on the Asynchronous Partitioned Global Address Space (APGAS) programming model, supporting the same fine-grained concurrency mechanisms within and across shared-memory nodes. We demonstrate that X10 delivers solid performance at petascale by running (weak scaling) eight application kernels on an IBM Power--775 supercomputer utilizing up to 55,680 Power7 cores (for 1.7Pflop/s of theoretical peak performance). For the four HPC Class 2 Challenge benchmarks, X10 achieves 41\% to 87\% of the system's potential at scale (as measured by IBM's HPCC Class 1 optimized runs). We also implement K-Means, Smith-Waterman, Betweenness Centrality, and Unbalanced Tree Search (UTS) for geometric trees. Our UTS implementation is the first to scale to petaflop systems. We describe the advances in distributed termination detection, distributed load balancing, and use of high-performance interconnects that enable X10 to scale out to tens of thousands of cores. We discuss how this work is driving the evolution of the X10 language, core class libraries, and runtime systems.", acknowledgement = ack-nhfb, articleno = "25", fjournal = "ACM Transactions on Parallel Computing", journal-URL = "http://dl.acm.org/citation.cfm?id=2632163", remark = "Special Issue on PPoPP'14 conference.", } @Article{Maleki:2016:LRM, author = "Saeed Maleki and Madanlal Musuvathi and Todd Mytkowicz", title = "Low-Rank Methods for Parallelizing Dynamic Programming Algorithms", journal = j-TOPC, volume = "2", number = "4", pages = "26:1--26:??", month = mar, year = "2016", CODEN = "????", DOI = "https://doi.org/10.1145/2884065", ISSN = "2329-4949 (print), 2329-4957 (electronic)", ISSN-L = "2329-4949", bibdate = "Sat Mar 19 08:11:13 MDT 2016", bibsource = "http://topc.acm.org/; http://www.math.utah.edu/pub/tex/bib/topc.bib", abstract = "This article proposes efficient parallel methods for an important class of dynamic programming problems that includes Viterbi, Needleman--Wunsch, Smith--Waterman, and Longest Common Subsequence. In dynamic programming, the subproblems that do not depend on each other, and thus can be computed in parallel, form stages or wavefronts. The methods presented in this article provide additional parallelism allowing multiple stages to be computed in parallel despite dependencies among them. The correctness and the performance of the algorithm relies on rank convergence properties of matrix multiplication in the tropical semiring, formed with plus as the multiplicative operation and max as the additive operation. This article demonstrates the efficiency of the parallel algorithm by showing significant speedups on a variety of important dynamic programming problems. In particular, the parallel Viterbi decoder is up to $ 24 \times $ faster (with 64 processors) than a highly optimized commercial baseline.", acknowledgement = ack-nhfb, articleno = "26", fjournal = "ACM Transactions on Parallel Computing", journal-URL = "http://dl.acm.org/citation.cfm?id=2632163", remark = "Special Issue on PPoPP'14 conference.", } @Article{Yuan:2016:FCN, author = "Xin Yuan and Wickus Nienaber and Santosh Mahapatra", title = "On Folded-{Clos} Networks with Deterministic Single-Path Routing", journal = j-TOPC, volume = "2", number = "4", pages = "27:1--27:??", month = mar, year = "2016", CODEN = "????", DOI = "https://doi.org/10.1145/2858654", ISSN = "2329-4949 (print), 2329-4957 (electronic)", ISSN-L = "2329-4949", bibdate = "Sat Mar 19 08:11:13 MDT 2016", bibsource = "http://topc.acm.org/; http://www.math.utah.edu/pub/tex/bib/topc.bib", abstract = "Folded-Clos networks, also known as fat-trees, have been widely used as interconnects in large-scale high-performance computing clusters. Although users often treat such interconnects as replacements of nonblocking crossbar switches that can carry out any permutation communication without contention, the networking capability of such interconnects without a centralized controller in computer communication environments is not well understood. In this article, we investigate nonblocking two-level folded-Clos networks with deterministic single-path routing, but no centralized controller, and establish the nonblocking condition. The results indicate that nonblocking two-level folded-Clos networks without a centralized controller are much more expensive to construct than the traditional nonblocking networks in the telecommunication environment. Practical two-level folded-Clos based interconnects are blocking. For such interconnects, we establish the lower bound for worst-case contention for permutations with any deterministic single-path routing scheme, show that existing routing schemes perform poorly in terms of worst-case contention for permutations, present a routing scheme that achieves the theoretical optimal, and empirically compare the performance of existing schemes with the optimal routing scheme. The techniques developed for two-level folded-Clos networks are further extended for the general fat-trees of any heights.", acknowledgement = ack-nhfb, articleno = "27", fjournal = "ACM Transactions on Parallel Computing", journal-URL = "http://dl.acm.org/citation.cfm?id=2632163", remark = "Special Issue on PPoPP'14 conference.", } @Article{Sandes:2016:MMA, author = "Edans F. De O. Sandes and Guillermo Miranda and Xavier Martorell and Eduard Ayguade and George Teodoro and Alba C. M. A. {De Melo}", title = "{MASA}: a Multiplatform Architecture for Sequence Aligners with Block Pruning", journal = j-TOPC, volume = "2", number = "4", pages = "28:1--28:??", month = mar, year = "2016", CODEN = "????", DOI = "https://doi.org/10.1145/2858656", ISSN = "2329-4949 (print), 2329-4957 (electronic)", ISSN-L = "2329-4949", bibdate = "Sat Mar 19 08:11:13 MDT 2016", bibsource = "http://topc.acm.org/; http://www.math.utah.edu/pub/tex/bib/topc.bib", abstract = "Biological sequence alignment is a very popular application in Bioinformatics, used routinely worldwide. Many implementations of biological sequence alignment algorithms have been proposed for multicores, GPUs, FPGAs and CellBEs. These implementations are platform-specific; porting them to other systems requires considerable programming effort. This article proposes and evaluates MASA, a flexible and customizable software architecture that enables the execution of biological sequence alignment applications with three variants (local, global, and semiglobal) in multiple hardware/software platforms with block pruning, which is able to reduce significantly the amount of data processed. To attain our flexibility goals, we also propose a generic version of block pruning and developed multiple parallelization strategies as building blocks, including a new asynchronous dataflow-based parallelization, which may be combined to implement efficient aligners in different platforms. We provide four MASA aligner implementations for multicores (OmpSs and OpenMP), GPU (CUDA), and Intel Phi (OpenMP), showing that MASA is very flexible. The evaluation of our generic block pruning strategy shows that it significantly outperforms the previously proposed block pruning, being able to prune up to 66.5\% of the cells when using the new dataflow-based parallelization strategy.", acknowledgement = ack-nhfb, articleno = "28", fjournal = "ACM Transactions on Parallel Computing", journal-URL = "http://dl.acm.org/citation.cfm?id=2632163", remark = "Special Issue on PPoPP'14 conference.", } @Article{MeyeraufderHeide:2016:ISI, author = "Friedhelm {Meyer auf der Heide} and Peter Sanders and Nodari Sitchinava", title = "Introduction to the Special Issue on {SPAA 2014}", journal = j-TOPC, volume = "3", number = "1", pages = "1:1--1:??", month = aug, year = "2016", CODEN = "????", DOI = "https://doi.org/10.1145/2936716", ISSN = "2329-4949 (print), 2329-4957 (electronic)", ISSN-L = "2329-4949", bibdate = "Fri Sep 23 15:24:51 MDT 2016", bibsource = "http://topc.acm.org/; http://www.math.utah.edu/pub/tex/bib/topc.bib", acknowledgement = ack-nhfb, articleno = "1", fjournal = "ACM Transactions on Parallel Computing", journal-URL = "http://dl.acm.org/citation.cfm?id=2632163", } @Article{Kaler:2016:EDD, author = "Tim Kaler and William Hasenplaugh and Tao B. Schardl and Charles E. Leiserson", title = "Executing Dynamic Data-Graph Computations Deterministically Using Chromatic Scheduling", journal = j-TOPC, volume = "3", number = "1", pages = "2:1--2:??", month = aug, year = "2016", CODEN = "????", DOI = "https://doi.org/10.1145/2896850", ISSN = "2329-4949 (print), 2329-4957 (electronic)", ISSN-L = "2329-4949", bibdate = "Fri Sep 23 15:24:51 MDT 2016", bibsource = "http://topc.acm.org/; http://www.math.utah.edu/pub/tex/bib/topc.bib", abstract = "A data-graph computation-popularized by such programming systems as Galois, Pregel, GraphLab, PowerGraph, and GraphChi-is an algorithm that performs local updates on the vertices of a graph. During each round of a data-graph computation, an update function atomically modifies the data associated with a vertex as a function of the vertex's prior data and that of adjacent vertices. A dynamic data-graph computation updates only an active subset of the vertices during a round, and those updates determine the set of active vertices for the next round. This article introduces Prism, a chromatic-scheduling algorithm for executing dynamic data-graph computations. Prism uses a vertex coloring of the graph to coordinate updates performed in a round, precluding the need for mutual-exclusion locks or other nondeterministic data synchronization. A multibag data structure is used by Prism to maintain a dynamic set of active vertices as an unordered set partitioned by color. We analyze Prism using work-span analysis. Let $ G = (V, E) $ be a degree-$ \Delta $ graph colored with \chi colors, and suppose that $ Q \subseteq V $ is the set of active vertices in a round. Define $ {\rm size} (Q) = | Q | + \Sigma_{v \in Q} {\rm deg}(v) $, which is proportional to the space required to store the vertices of $Q$ using a sparse-graph layout. We show that a $P$-processor execution of Prism performs updates in $Q$ using $O(\chi (l g (Q / \chi) + l g \Delta)) + l g P$ span and $ \Theta (s i z e (Q) + P)$ work. These theoretical guarantees are matched by good empirical performance. To isolate the effect of the scheduling algorithm on performance, we modified GraphLab to incorporate Prism and studied seven application benchmarks on a 12-core multicore machine. Prism executes the benchmarks 1.2 to 2.1 times faster than GraphLab's nondeterministic lock-based scheduler while providing deterministic behavior. This article also presents Prism-R, a variation of Prism that executes dynamic data-graph computations deterministically even when updates modify global variables with associative operations. Prism-R satisfies the same theoretical bounds as Prism, but its implementation is more involved, incorporating a multivector data structure to maintain a deterministically ordered set of vertices partitioned by color. Despite its additional complexity, Prism-R is only marginally slower than Prism. On the seven application benchmarks studied, Prism-R incurs a 7\% geometric mean overhead relative to Prism.", acknowledgement = ack-nhfb, articleno = "2", fjournal = "ACM Transactions on Parallel Computing", journal-URL = "http://dl.acm.org/citation.cfm?id=2632163", } @Article{Im:2016:CST, author = "Sungjin Im and Benjamin Moseley and Kirk Pruhs and Eric Torng", title = "Competitively Scheduling Tasks with Intermediate Parallelizability", journal = j-TOPC, volume = "3", number = "1", pages = "4:1--4:??", month = aug, year = "2016", CODEN = "????", DOI = "https://doi.org/10.1145/2938378", ISSN = "2329-4949 (print), 2329-4957 (electronic)", ISSN-L = "2329-4949", bibdate = "Fri Sep 23 15:24:51 MDT 2016", bibsource = "http://topc.acm.org/; http://www.math.utah.edu/pub/tex/bib/topc.bib", abstract = "We introduce a scheduling algorithm Intermediate-SRPT, and show that it is $O (\log P)$-competitive with respect to average flow time when scheduling jobs whose parallelizability is intermediate between being fully parallelizable and sequential. Here, the parameter P denotes the ratio between the maximum job size to the minimum. We also show a general matching lower bound on the competitive ratio. Our analysis builds on an interesting combination of potential function and local competitiveness arguments.", acknowledgement = ack-nhfb, articleno = "4", fjournal = "ACM Transactions on Parallel Computing", journal-URL = "http://dl.acm.org/citation.cfm?id=2632163", } @Article{Bercea:2016:CMI, author = "Ioana O. Bercea and Navin Goyal and David G. Harris and Aravind Srinivasan", title = "On Computing Maximal Independent Sets of Hypergraphs in Parallel", journal = j-TOPC, volume = "3", number = "1", pages = "5:1--5:??", month = aug, year = "2016", CODEN = "????", DOI = "https://doi.org/10.1145/2938436", ISSN = "2329-4949 (print), 2329-4957 (electronic)", ISSN-L = "2329-4949", bibdate = "Fri Sep 23 15:24:51 MDT 2016", bibsource = "http://topc.acm.org/; http://www.math.utah.edu/pub/tex/bib/topc.bib", abstract = "Whether or not the problem of finding maximal independent sets (MIS) in hypergraphs is in (R)NC is one of the fundamental problems in the theory of parallel computing. Essentially, the challenge is to design (randomized) algorithms in which the number of processors used is polynomial and the (expected) runtime is polylogarithmic in the size of the input. Unlike the well-understood case of MIS in graphs, for the hypergraph problem, our knowledge is quite limited despite considerable work. It is known that the problem is in RNC when the edges of the hypergraph have constant size. For general hypergraphs with n vertices and m edges, the fastest previously known algorithm works in time $O(\sqrt n) $ with $ \poly (m, n) $ processors. In this article, we give an EREW PRAM randomized algorithm that works in time $ n^{o (1)} $ with $O(n + m \log n) $ processors on general hypergraphs satisfying $ m \leq n^{o (1) \log \log n / \log \log \log n} $. We also give an EREW PRAM deterministic algorithm that runs in time $ n^\epsilon $ on a graph with $ m \leq n^{1 / \delta } $ edges, for any constants $ \delta $, $ \epsilon $; the number of processors is polynomial in $m$, $n$ for a fixed choice of $ \delta $, \epsilon . Our algorithms are based on a sampling idea that reduces the dimension of the hypergraph and employs the algorithm for constant dimension hypergraphs as a subroutine.", acknowledgement = ack-nhfb, articleno = "5", fjournal = "ACM Transactions on Parallel Computing", journal-URL = "http://dl.acm.org/citation.cfm?id=2632163", } @Article{Bilo:2016:LBN, author = "Davide Bil{\`o} and Luciano Gual{\`a} and Stefano Leucci and Guido Proietti", title = "Locality-Based Network Creation Games", journal = j-TOPC, volume = "3", number = "1", pages = "6:1--6:??", month = aug, year = "2016", CODEN = "????", DOI = "https://doi.org/10.1145/2938426", ISSN = "2329-4949 (print), 2329-4957 (electronic)", ISSN-L = "2329-4949", bibdate = "Fri Sep 23 15:24:51 MDT 2016", bibsource = "http://topc.acm.org/; http://www.math.utah.edu/pub/tex/bib/topc.bib", abstract = "Network creation games have been extensively studied, both by economists and computer scientists, due to their versatility in modeling individual-based community formation processes. These processes, in turn, are the theoretical counterpart of several economics, social, and computational applications on the Internet. In their several variants, these games model the tension of a player between the player's two antagonistic goals: to be as close as possible to the other players and to activate a cheapest possible set of links. However, the generally adopted assumption is that players have a common and complete information about the ongoing network, which is quite unrealistic in practice. In this article, we consider a more compelling scenario in which players have only limited information about the network in which they are embedded. More precisely, we explore the game-theoretic and computational implications of assuming that players have a complete knowledge of the network structure only up to a given radius k, which is one of the most qualified local-knowledge models used in distributed computing. In this respect, we define a suitable equilibrium concept, and we provide a comprehensive set of upper and lower bounds to the price of anarchy for the entire range of values of k and for the two classic variants of the game, namely, those in which a player's cost-besides the activation cost of the owned links-depends on the maximum/sum of all distances to the other nodes in the network, respectively. These bounds are assessed through an extensive set of experiments.", acknowledgement = ack-nhfb, articleno = "6", fjournal = "ACM Transactions on Parallel Computing", journal-URL = "http://dl.acm.org/citation.cfm?id=2632163", } @Article{Jiang:2016:PPA, author = "Jiayang Jiang and Michael Mitzenmacher and Justin Thaler", title = "Parallel Peeling Algorithms", journal = j-TOPC, volume = "3", number = "1", pages = "7:1--7:??", month = aug, year = "2016", CODEN = "????", DOI = "https://doi.org/10.1145/2938412", ISSN = "2329-4949 (print), 2329-4957 (electronic)", ISSN-L = "2329-4949", bibdate = "Fri Sep 23 15:24:51 MDT 2016", bibsource = "http://topc.acm.org/; http://www.math.utah.edu/pub/tex/bib/topc.bib", abstract = "The analysis of several algorithms and data structures can be framed as a peeling process on a random hypergraph: vertices with degree less than k are removed until there are no vertices of degree less than k left. The remaining hypergraph is known as the k core. In this article, we analyze parallel peeling processes, in which in each round, all vertices of degree less than k are removed. It is known that, below a specific edge-density threshold, the k -core is empty with high probability. We show that, with high probability, below this threshold, only $1 / \log((k - 1) (r - 1)) \log \log n + O (1)$ rounds of peeling are needed to obtain the empty $k$-core for $4br$-uniform hypergraphs. This bound is tight up to an additive constant. Interestingly, we show that, above this threshold, $\Omega (\log n)$ rounds of peeling are required to find the nonempty $k$-core. Since most algorithms and data structures aim to peel to an empty $k$-core, this asymmetry appears fortunate. We verify the theoretical results both with simulation and with a parallel implementation using graphics processing units (GPUs). Our implementation provides insights into how to structure parallel peeling algorithms for efficiency in practice.", acknowledgement = ack-nhfb, articleno = "7", fjournal = "ACM Transactions on Parallel Computing", journal-URL = "http://dl.acm.org/citation.cfm?id=2632163", } @Article{Simhadri:2016:EAS, author = "Harsha Vardhan Simhadri and Guy E. Blelloch and Jeremy T. Fineman and Phillip B. Gibbons and Aapo Kyrola", title = "Experimental Analysis of Space-Bounded Schedulers", journal = j-TOPC, volume = "3", number = "1", pages = "8:1--8:??", month = aug, year = "2016", CODEN = "????", DOI = "https://doi.org/10.1145/2938389", ISSN = "2329-4949 (print), 2329-4957 (electronic)", ISSN-L = "2329-4949", bibdate = "Fri Sep 23 15:24:51 MDT 2016", bibsource = "http://topc.acm.org/; http://www.math.utah.edu/pub/tex/bib/topc.bib", abstract = "The running time of nested parallel programs on shared-memory machines depends in significant part on how well the scheduler mapping the program to the machine is optimized for the organization of caches and processor cores on the machine. Recent work proposed ``space-bounded schedulers'' for scheduling such programs on the multilevel cache hierarchies of current machines. The main benefit of this class of schedulers is that they provably preserve locality of the program at every level in the hierarchy, which can result in fewer cache misses and better use of bandwidth than the popular work-stealing scheduler. On the other hand, compared to work stealing, space-bounded schedulers are inferior at load balancing and may have greater scheduling overheads, raising the question as to the relative effectiveness of the two schedulers in practice. In this article, we provide the first experimental study aimed at addressing this question. To facilitate this study, we built a flexible experimental framework with separate interfaces for programs and schedulers. This enables a head-to-head comparison of the relative strengths of schedulers in terms of running times and cache miss counts across a range of benchmarks. (The framework is validated by comparisons with the Intel{\reg} CilkTM Plus work-stealing scheduler.) We present experimental results on a 32-core Xeon{\reg} 7560 comparing work stealing, hierarchy-minded work stealing, and two variants of space-bounded schedulers on both divide-and-conquer microbenchmarks and some popular algorithmic kernels. Our results indicate that space-bounded schedulers reduce the number of L3 cache misses compared to work-stealing schedulers by 25\% to 65\% for most of the benchmarks, but incur up to 27\% additional scheduler and load-imbalance overhead. Only for memory-intensive benchmarks can the reduction in cache misses overcome the added overhead, resulting in up to a 25\% improvement in running time for synthetic benchmarks and about 20\% improvement for algorithmic kernels. We also quantify runtime improvements varying the available bandwidth per core (the ``bandwidth gap'') and show up to 50\% improvements in the running times of kernels as this gap increases fourfold. As part of our study, we generalize prior definitions of space-bounded schedulers to allow for more practical variants (while still preserving their guarantees) and explore implementation tradeoffs.", acknowledgement = ack-nhfb, articleno = "8", fjournal = "ACM Transactions on Parallel Computing", journal-URL = "http://dl.acm.org/citation.cfm?id=2632163", } @Article{Sheikh:2016:SHJ, author = "Hafiz Fahad Sheikh and Ishfaq Ahmad", title = "Sixteen Heuristics for Joint Optimization of Performance, Energy, and Temperature in Allocating Tasks to Multi-Cores", journal = j-TOPC, volume = "3", number = "2", pages = "9:1--9:??", month = aug, year = "2016", CODEN = "????", DOI = "https://doi.org/10.1145/2948973", ISSN = "2329-4949 (print), 2329-4957 (electronic)", ISSN-L = "2329-4949", bibdate = "Fri Sep 23 15:24:52 MDT 2016", bibsource = "http://topc.acm.org/; http://www.math.utah.edu/pub/tex/bib/topc.bib", abstract = "Three-way joint optimization of performance ( P ), energy ( E ), and temperature ( T ) in scheduling parallel tasks to multiple cores poses a challenge that is staggering in its computational complexity. The goal of the PET optimized scheduling ( PETOS ) problem is to minimize three quantities: the completion time of a task graph, the total energy consumption, and the peak temperature of the system. Algorithms based on conventional multi-objective optimization techniques can be designed for solving the PETOS problem. But their execution times are exceedingly high and hence their applicability is restricted merely to problems of modest size. Exacerbating the problem is the solution space that is typically a Pareto front since no single solution can be strictly best along all three objectives. Thus, not only is the absolute quality of the solutions important but ``the spread of the solutions'' along each objective and the distribution of solutions within the generated tradeoff front are also desired. A natural alternative is to design efficient heuristic algorithms that can generate good solutions as well as good spreads --- note that most of the prior work in energy-efficient task allocation is predominantly single- or dual-objective oriented. Given a directed acyclic graph (DAG) representing a parallel program, a heuristic encompasses policies as to what tasks should go to what cores and at what frequency should that core operate. Various policies, such as greedy, iterative, and probabilistic, can be employed. However, the choice and usage of these policies can influence a heuristic towards a particular objective and can also profoundly impact its performance. This article proposes 16 heuristics that utilize various methods for task-to-core allocation and frequency selection. This article also presents a methodical classification scheme which not only categorizes the proposed heuristics but can also accommodate additional heuristics. Extensive simulation experiments compare these algorithms while shedding light on their strengths and tradeoffs.", acknowledgement = ack-nhfb, articleno = "9", fjournal = "ACM Transactions on Parallel Computing", journal-URL = "http://dl.acm.org/citation.cfm?id=2632163", } @Article{Blanchard:2016:SMO, author = "Jeffrey D. Blanchard and Erik Opavsky and Emircan Uysaler", title = "Selecting Multiple Order Statistics with a Graphics Processing Unit", journal = j-TOPC, volume = "3", number = "2", pages = "10:1--10:??", month = aug, year = "2016", CODEN = "????", DOI = "https://doi.org/10.1145/2948974", ISSN = "2329-4949 (print), 2329-4957 (electronic)", ISSN-L = "2329-4949", bibdate = "Fri Sep 23 15:24:52 MDT 2016", bibsource = "http://topc.acm.org/; http://www.math.utah.edu/pub/tex/bib/topc.bib", abstract = "Extracting a set of multiple order statistics from a huge data set provides important information about the distribution of the values in the full set of data. This article introduces an algorithm, bucketMultiSelect, for simultaneously selecting multiple order statistics with a graphics processing unit (GPU). Typically, when a large set of order statistics is desired, the vector is sorted. When the sorted version of the vector is not needed, bucketMultiSelect significantly reduces computation time by eliminating a large portion of the unnecessary operations involved in sorting. For large vectors, bucketMultiSelect returns thousands of order statistics in less time than sorting the vector while typically using less memory. For vectors containing $2^{28}$ values of type double, bucketMultiSelect selects the 101 percentile order statistics in less than 95ms and is more than $8 \times $ faster than sorting the vector with a GPU optimized merge sort.", acknowledgement = ack-nhfb, articleno = "10", fjournal = "ACM Transactions on Parallel Computing", journal-URL = "http://dl.acm.org/citation.cfm?id=2632163", } @Article{Bohme:2016:IRC, author = "David B{\"o}hme and Markus Geimer and Lukas Arnold and Felix Voigtlaender and Felix Wolf", title = "Identifying the Root Causes of Wait States in Large-Scale Parallel Applications", journal = j-TOPC, volume = "3", number = "2", pages = "11:1--11:??", month = aug, year = "2016", CODEN = "????", DOI = "https://doi.org/10.1145/2934661", ISSN = "2329-4949 (print), 2329-4957 (electronic)", ISSN-L = "2329-4949", bibdate = "Fri Sep 23 15:24:52 MDT 2016", bibsource = "http://topc.acm.org/; http://www.math.utah.edu/pub/tex/bib/topc.bib", abstract = "Driven by growing application requirements and accelerated by current trends in microprocessor design, the number of processor cores on modern supercomputers is increasing from generation to generation. However, load or communication imbalance prevents many codes from taking advantage of the available parallelism, as delays of single processes may spread wait states across the entire machine. Moreover, when employing complex point-to-point communication patterns, wait states may propagate along far-reaching cause-effect chains that are hard to track manually and that complicate an assessment of the actual costs of an imbalance. Building on earlier work by Meira, Jr., et al., we present a scalable approach that identifies program wait states and attributes their costs in terms of resource waste to their original cause. By replaying event traces in parallel both forward and backward, we can identify the processes and call paths responsible for the most severe imbalances, even for runs with hundreds of thousands of processes.", acknowledgement = ack-nhfb, articleno = "11", fjournal = "ACM Transactions on Parallel Computing", journal-URL = "http://dl.acm.org/citation.cfm?id=2632163", } @Article{Dathathri:2016:CAL, author = "Roshan Dathathri and Ravi Teja Mullapudi and Uday Bondhugula", title = "Compiling Affine Loop Nests for a Dynamic Scheduling Runtime on Shared and Distributed Memory", journal = j-TOPC, volume = "3", number = "2", pages = "12:1--12:??", month = aug, year = "2016", CODEN = "????", DOI = "https://doi.org/10.1145/2948975", ISSN = "2329-4949 (print), 2329-4957 (electronic)", ISSN-L = "2329-4949", bibdate = "Fri Sep 23 15:24:52 MDT 2016", bibsource = "http://topc.acm.org/; http://www.math.utah.edu/pub/tex/bib/topc.bib", abstract = "Current de-facto parallel programming models like OpenMP and MPI make it difficult to extract task-level dataflow parallelism as opposed to bulk-synchronous parallelism. Task parallel approaches that use point-to-point synchronization between dependent tasks in conjunction with dynamic scheduling dataflow runtimes are thus becoming attractive. Although good performance can be extracted for both shared and distributed memory using these approaches, there is little compiler support for them. In this article, we describe the design of compiler--runtime interaction to automatically extract coarse-grained dataflow parallelism in affine loop nests for both shared and distributed-memory architectures. We use techniques from the polyhedral compiler framework to extract tasks and generate components of the runtime that are used to dynamically schedule the generated tasks. The runtime includes a distributed decentralized scheduler that dynamically schedules tasks on a node. The schedulers on different nodes cooperate with each other through asynchronous point-to-point communication, and all of this is achieved by code automatically generated by the compiler. On a set of six representative affine loop nest benchmarks, while running on 32 nodes with 8 threads each, our compiler-assisted runtime yields a geometric mean speedup of $ 143.6 \times $ ($ 70.3 \times $ to $ 474.7 \times $) over the sequential version and a geometric mean speedup of $ 1.64 \times $ ($ 1.04 \times $ to $ 2.42 \times $) over the state-of-the-art automatic parallelization approach that uses bulk synchronization. We also compare our system with past work that addresses some of these challenges on shared memory, and an emerging runtime (Intel Concurrent Collections) that demands higher programmer input and effort in parallelizing. To the best of our knowledge, ours is also the first automatic scheme that allows for dynamic scheduling of affine loop nests on a cluster of multicores.", acknowledgement = ack-nhfb, articleno = "12", fjournal = "ACM Transactions on Parallel Computing", journal-URL = "http://dl.acm.org/citation.cfm?id=2632163", } @Article{Benoit:2016:AGP, author = "Anne Benoit and Aur{\'e}lien Cavelan and Yves Robert and Hongyang Sun", title = "Assessing General-Purpose Algorithms to Cope with Fail-Stop and Silent Errors", journal = j-TOPC, volume = "3", number = "2", pages = "13:1--13:??", month = aug, year = "2016", CODEN = "????", DOI = "https://doi.org/10.1145/2897189", ISSN = "2329-4949 (print), 2329-4957 (electronic)", ISSN-L = "2329-4949", bibdate = "Fri Sep 23 15:24:52 MDT 2016", bibsource = "http://topc.acm.org/; http://www.math.utah.edu/pub/tex/bib/topc.bib", abstract = "In this article, we combine the traditional checkpointing and rollback recovery strategies with verification mechanisms to cope with both fail-stop and silent errors. The objective is to minimize makespan and/or energy consumption. For divisible load applications, we use first-order approximations to find the optimal checkpointing period to minimize execution time, with an additional verification mechanism to detect silent errors before each checkpoint, hence extending the classical formula by Young and Daly for fail-stop errors only. We further extend the approach to include intermediate verifications, and to consider a bicriteria problem involving both time and energy (linear combination of execution time and energy consumption). Then, we focus on application workflows whose dependence graph is a linear chain of tasks. Here, we determine the optimal checkpointing and verification locations, with or without intermediate verifications, for the bicriteria problem. Rather than using a single speed during the whole execution, we further introduce a new execution scenario, which allows for changing the execution speed via Dynamic Voltage and Frequency Scaling (DVFS). In this latter scenario, we determine the optimal checkpointing and verification locations, as well as the optimal speed pairs for each task segment between any two consecutive checkpoints. Finally, we conduct an extensive set of simulations to support the theoretical study, and to assess the performance of each algorithm, showing that the best overall performance is achieved under the most flexible scenario using intermediate verifications and different speeds.", acknowledgement = ack-nhfb, articleno = "13", fjournal = "ACM Transactions on Parallel Computing", journal-URL = "http://dl.acm.org/citation.cfm?id=2632163", } @Article{Koutis:2016:SPD, author = "Ioannis Koutis and Shen Chen Xu", title = "Simple Parallel and Distributed Algorithms for Spectral Graph Sparsification", journal = j-TOPC, volume = "3", number = "2", pages = "14:1--14:??", month = aug, year = "2016", CODEN = "????", DOI = "https://doi.org/10.1145/2948062", ISSN = "2329-4949 (print), 2329-4957 (electronic)", ISSN-L = "2329-4949", bibdate = "Fri Sep 23 15:24:52 MDT 2016", bibsource = "http://topc.acm.org/; http://www.math.utah.edu/pub/tex/bib/topc.bib", abstract = "We describe simple algorithms for spectral graph sparsification, based on iterative computations of weighted spanners and sampling. Leveraging the algorithms of Baswana and Sen for computing spanners, we obtain the first distributed spectral sparsification algorithm in the CONGEST model. We also obtain a parallel algorithm with improved work and time guarantees, as well as other natural distributed implementations. Combining this algorithm with the parallel framework of Peng and Spielman for solving symmetric diagonally dominant linear systems, we get a parallel solver that is significantly more efficient in terms of the total work.", acknowledgement = ack-nhfb, articleno = "14", fjournal = "ACM Transactions on Parallel Computing", journal-URL = "http://dl.acm.org/citation.cfm?id=2632163", } @Article{Dorier:2016:DAP, author = "Matthieu Dorier and Gabriel Antoniu and Franck Cappello and Marc Snir and Robert Sisneros and Orcun Yildiz and Shadi Ibrahim and Tom Peterka and Leigh Orf", title = "{Damaris}: Addressing Performance Variability in Data Management for Post-Petascale Simulations", journal = j-TOPC, volume = "3", number = "3", pages = "15:1--15:??", month = dec, year = "2016", CODEN = "????", DOI = "https://doi.org/10.1145/2987371", ISSN = "2329-4949 (print), 2329-4957 (electronic)", ISSN-L = "2329-4949", bibdate = "Mon Dec 26 17:40:41 MST 2016", bibsource = "http://topc.acm.org/; http://www.math.utah.edu/pub/tex/bib/topc.bib", abstract = "With exascale computing on the horizon, reducing performance variability in data management tasks (storage, visualization, analysis, etc.) is becoming a key challenge in sustaining high performance. This variability significantly impacts the overall application performance at scale and its predictability over time. In this article, we present Damaris, a system that leverages dedicated cores in multicore nodes to offload data management tasks, including I/O, data compression, scheduling of data movements, in situ analysis, and visualization. We evaluate Damaris with the CM1 atmospheric simulation and the Nek5000 computational fluid dynamic simulation on four platforms, including NICS's Kraken and NCSA's Blue Waters. Our results show that (1) Damaris fully hides the I/O variability as well as all I/O-related costs, thus making simulation performance predictable; (2) it increases the sustained write throughput by a factor of up to 15 compared with standard I/O approaches; (3) it allows almost perfect scalability of the simulation up to over 9,000 cores, as opposed to state-of-the-art approaches that fail to scale; and (4) it enables a seamless connection to the VisIt visualization software to perform in situ analysis and visualization in a way that impacts neither the performance of the simulation nor its variability. In addition, we extended our implementation of Damaris to also support the use of dedicated nodes and conducted a thorough comparison of the two approaches-dedicated cores and dedicated nodes-for I/O tasks with the aforementioned applications.", acknowledgement = ack-nhfb, articleno = "15", fjournal = "ACM Transactions on Parallel Computing", journal-URL = "http://dl.acm.org/citation.cfm?id=2632163", } @Article{Gao:2016:AOM, author = "Jiaquan Gao and Yu Wang and Jun Wang and Ronghua Liang", title = "Adaptive Optimization Modeling of Preconditioned Conjugate Gradient on Multi-{GPUs}", journal = j-TOPC, volume = "3", number = "3", pages = "16:1--16:??", month = dec, year = "2016", CODEN = "????", DOI = "https://doi.org/10.1145/2990849", ISSN = "2329-4949 (print), 2329-4957 (electronic)", ISSN-L = "2329-4949", bibdate = "Mon Dec 26 17:40:41 MST 2016", bibsource = "http://topc.acm.org/; http://www.math.utah.edu/pub/tex/bib/topc.bib", abstract = "The preconditioned conjugate gradient (PCG) algorithm is a well-known iterative method for solving sparse linear systems in scientific computations. GPU-accelerated PCG algorithms for large-sized problems have attracted considerable attention recently. However, on a specific multi-GPU platform, producing a highly parallel PCG implementation for any large-sized problem requires significant time because several manual steps are involved in adjusting the related parameters and selecting an appropriate storage format for the matrix block that is assigned to each GPU. This motivates us to propose adaptive optimization modeling of PCG on multi-GPUs, which mainly involves the following parts: (1) an optimization multi-GPU parallel framework of PCG and (2) the profile-based optimization modeling for each one of the main components of the PCG algorithm, including vector operation, inner product, and sparse matrix-vector multiplication (SpMV). Our model does not construct a new storage format or kernel but automatically and rapidly generates an optimal parallel PCG algorithm for any problem on a specific multi-GPU platform by integrating existing storage formats and kernels. We take a vector operation kernel, an inner-product kernel, and five popular SpMV kernels for an example to present the idea of constructing the model. Given that our model is general, independent of the problems, and dependent on the resources of devices, this model is constructed only once for each type of GPU. The experiments validate the high efficiency of our proposed model.", acknowledgement = ack-nhfb, articleno = "16", fjournal = "ACM Transactions on Parallel Computing", journal-URL = "http://dl.acm.org/citation.cfm?id=2632163", } @Article{Creech:2016:TSS, author = "Timothy Creech and Rajeev Barua", title = "Transparently Space Sharing a Multicore Among Multiple Processes", journal = j-TOPC, volume = "3", number = "3", pages = "17:1--17:??", month = dec, year = "2016", CODEN = "????", DOI = "https://doi.org/10.1145/3001910", ISSN = "2329-4949 (print), 2329-4957 (electronic)", ISSN-L = "2329-4949", bibdate = "Mon Dec 26 17:40:41 MST 2016", bibsource = "http://topc.acm.org/; http://www.math.utah.edu/pub/tex/bib/topc.bib", abstract = "As hardware becomes increasingly parallel and the availability of scalable parallel software improves, the problem of managing multiple multithreaded applications (processes) becomes important. Malleable processes, which can vary the number of threads used as they run, enable sophisticated and flexible resource management. Although many existing applications parallelized for SMPs with parallel runtimes are in fact already malleable, deployed runtime environments provide no interface nor any strategy for intelligently allocating hardware threads or even preventing oversubscription. Prior research methods either depend on profiling applications ahead of time to make good decisions about allocations or do not account for process efficiency at all, leading to poor performance. None of these prior methods have been adapted widely in practice. This article presents the Scheduling and Allocation with Feedback (SCAF) system: a drop-in runtime solution that supports existing malleable applications in making intelligent allocation decisions based on observed efficiency without any changes to semantics, program modification, offline profiling, or even recompilation. Our existing implementation can control most unmodified OpenMP applications. Other malleable threading libraries can also easily be supported with small modifications without requiring application modification or recompilation. In this work, we present the SCAF daemon and a SCAF-aware port of the GNU OpenMP runtime. We present a new technique for estimating process efficiency purely at runtime using available hardware counters and demonstrate its effectiveness in aiding allocation decisions. We evaluated SCAF using NAS NPB parallel benchmarks on five commodity parallel platforms, enumerating architectural features and their effects on our scheme. We measured the benefit of SCAF in terms of sum of speedups improvement (a common metric for multiprogrammed environments) when running all benchmark pairs concurrently compared to equipartitioning-the best existing competing scheme in the literature. We found that SCAF improves on equipartitioning on four out of five machines, showing a mean improvement factor in sum of speedups of 1.04 to 1.11x for benchmark pairs, depending on the machine, and 1.09x on average. Since we are not aware of any widely available tool for equipartitioning, we also compare SCAF against multiprogramming using unmodified OpenMP, which is the only environment available to end users today. SCAF improves on the unmodified OpenMP runtimes for all five machines, with a mean improvement of 1.08 to 2.07x, depending on the machine, and 1.59x on average.", acknowledgement = ack-nhfb, articleno = "17", fjournal = "ACM Transactions on Parallel Computing", journal-URL = "http://dl.acm.org/citation.cfm?id=2632163", } @Article{Ballard:2016:HPS, author = "Grey Ballard and Alex Druinsky and Nicholas Knight and Oded Schwartz", title = "Hypergraph Partitioning for Sparse Matrix--Matrix Multiplication", journal = j-TOPC, volume = "3", number = "3", pages = "18:1--18:??", month = dec, year = "2016", CODEN = "????", DOI = "https://doi.org/10.1145/3015144", ISSN = "2329-4949 (print), 2329-4957 (electronic)", ISSN-L = "2329-4949", bibdate = "Mon Dec 26 17:40:41 MST 2016", bibsource = "http://topc.acm.org/; http://www.math.utah.edu/pub/tex/bib/topc.bib", abstract = "We propose a fine-grained hypergraph model for sparse matrix-matrix multiplication (SpGEMM), a key computational kernel in scientific computing and data analysis whose performance is often communication bound. This model correctly describes both the interprocessor communication volume along a critical path in a parallel computation and also the volume of data moving through the memory hierarchy in a sequential computation. We show that identifying a communication-optimal algorithm for particular input matrices is equivalent to solving a hypergraph partitioning problem. Our approach is nonzero structure dependent, meaning that we seek the best algorithm for the given input matrices. In addition to our three-dimensional fine-grained model, we also propose coarse-grained one-dimensional and two-dimensional models that correspond to simpler SpGEMM algorithms. We explore the relations between our models theoretically, and we study their performance experimentally in the context of three applications that use SpGEMM as a key computation. For each application, we find that at least one coarse-grained model is as communication efficient as the fine-grained model. We also observe that different applications have affinities for different algorithms. Our results demonstrate that hypergraphs are an accurate model for reasoning about the communication costs of SpGEMM as well as a practical tool for exploring the SpGEMM algorithm design space.", acknowledgement = ack-nhfb, articleno = "18", fjournal = "ACM Transactions on Parallel Computing", journal-URL = "http://dl.acm.org/citation.cfm?id=2632163", } @Article{Grove:2017:ISS, author = "David Grove", title = "Introduction to the Special Section on {PPoPP'15}", journal = j-TOPC, volume = "3", number = "4", pages = "19:1--19:??", month = mar, year = "2017", CODEN = "????", DOI = "https://doi.org/10.1145/3040224", ISSN = "2329-4949 (print), 2329-4957 (electronic)", ISSN-L = "2329-4949", bibdate = "Sat Mar 25 07:55:06 MDT 2017", bibsource = "http://topc.acm.org/; http://www.math.utah.edu/pub/tex/bib/topc.bib", acknowledgement = ack-nhfb, articleno = "19", fjournal = "ACM Transactions on Parallel Computing", journal-URL = "http://dl.acm.org/citation.cfm?id=2632163", } @Article{Majo:2017:LPC, author = "Zoltan Majo and Thomas R. Gross", title = "A Library for Portable and Composable Data Locality Optimizations for {NUMA} Systems", journal = j-TOPC, volume = "3", number = "4", pages = "20:1--20:??", month = mar, year = "2017", CODEN = "????", DOI = "https://doi.org/10.1145/3040222", ISSN = "2329-4949 (print), 2329-4957 (electronic)", ISSN-L = "2329-4949", bibdate = "Sat Mar 25 07:55:06 MDT 2017", bibsource = "http://topc.acm.org/; http://www.math.utah.edu/pub/tex/bib/topc.bib", abstract = "Many recent multiprocessor systems are realized with a nonuniform memory architecture (NUMA) and accesses to remote memory locations take more time than local memory accesses. Optimizing NUMA memory system performance is difficult and costly for three principal reasons: (1) Today's programming languages/libraries have no explicit support for NUMA systems, (2) NUMA optimizations are not portable, and (3) optimizations are not composable (i.e., they can become ineffective or worsen performance in environments that support composable parallel software). This article presents TBB-NUMA, a parallel programming library based on Intel Threading Building Blocks (TBB) that supports portable and composable NUMA-aware programming. TBB-NUMA provides a model of task affinity that captures a programmer's insights on mapping tasks to resources. NUMA-awareness affects all layers of the library (i.e., resource management, task scheduling, and high-level parallel algorithm templates) and requires close coupling between all these layers. Optimizations implemented with TBB-NUMA (for a set of standard benchmark programs) result in up to 44\% performance improvement over standard TBB. But more important, optimized programs are portable across different NUMA architectures and preserve data locality also when composed with other parallel computations sharing the same resource management layer.", acknowledgement = ack-nhfb, articleno = "20", fjournal = "ACM Transactions on Parallel Computing", journal-URL = "http://dl.acm.org/citation.cfm?id=2632163", } @Article{Golan-Gueta:2017:ASA, author = "Guy Golan-Gueta and G. Ramalingam and Mooly Sagiv and Eran Yahav", title = "Automatic Scalable Atomicity via Semantic Locking", journal = j-TOPC, volume = "3", number = "4", pages = "21:1--21:??", month = mar, year = "2017", CODEN = "????", DOI = "https://doi.org/10.1145/3040223", ISSN = "2329-4949 (print), 2329-4957 (electronic)", ISSN-L = "2329-4949", bibdate = "Sat Mar 25 07:55:06 MDT 2017", bibsource = "http://topc.acm.org/; http://www.math.utah.edu/pub/tex/bib/topc.bib", abstract = "In this article, we consider concurrent programs in which the shared state consists of instances of linearizable abstract data types (ADTs). We present an automated approach to concurrency control that addresses a common need: the need to atomically execute a code fragment, which may contain multiple ADT operations on multiple ADT instances. We present a synthesis algorithm that automatically enforces atomicity of given code fragments (in a client program) by inserting pessimistic synchronization that guarantees atomicity and deadlock-freedom (without using any rollback mechanism). Our algorithm takes a commutativity specification as an extra input. This specification indicates for every pair of ADT operations the conditions under which the operations commute. Our algorithm enables greater parallelism by permitting commuting operations to execute concurrently. We have implemented the synthesis algorithm in a Java compiler and applied it to several Java programs. Our results show that our approach produces efficient and scalable synchronization.", acknowledgement = ack-nhfb, articleno = "21", fjournal = "ACM Transactions on Parallel Computing", journal-URL = "http://dl.acm.org/citation.cfm?id=2632163", } @Article{Izraelevitz:2017:GSN, author = "Joseph Izraelevitz and Michael L. Scott", title = "Generality and Speed in Nonblocking Dual Containers", journal = j-TOPC, volume = "3", number = "4", pages = "22:1--22:??", month = mar, year = "2017", CODEN = "????", DOI = "https://doi.org/10.1145/3040220", ISSN = "2329-4949 (print), 2329-4957 (electronic)", ISSN-L = "2329-4949", bibdate = "Sat Mar 25 07:55:06 MDT 2017", bibsource = "http://topc.acm.org/; http://www.math.utah.edu/pub/tex/bib/topc.bib", abstract = "Nonblocking dual data structures extend traditional notions of nonblocking progress to accommodate partial methods, both by bounding the number of steps that a thread can execute after its preconditions have been satisfied and by ensuring that a waiting thread performs no remote memory accesses that could interfere with the execution of other threads. A nonblocking dual container, in particular, is designed to hold either data or requests. An insert operation either adds data to the container or removes and satisfies a request; a remove operation either takes data out of the container or inserts a request. We present the first general-purpose construction for nonblocking dual containers, allowing any nonblocking container for data to be paired with almost any nonblocking container for requests. We also present new custom algorithms, based on the LCRQ of Morrison and Afek, that outperform the fastest previously known dual containers by factors of four to six.", acknowledgement = ack-nhfb, articleno = "22", fjournal = "ACM Transactions on Parallel Computing", journal-URL = "http://dl.acm.org/citation.cfm?id=2632163", } @Article{Cole:2017:ROS, author = "Richard Cole and Vijaya Ramachandran", title = "Resource Oblivious Sorting on Multicores", journal = j-TOPC, volume = "3", number = "4", pages = "23:1--23:??", month = mar, year = "2017", CODEN = "????", DOI = "https://doi.org/10.1145/3040221", ISSN = "2329-4949 (print), 2329-4957 (electronic)", ISSN-L = "2329-4949", bibdate = "Sat Mar 25 07:55:06 MDT 2017", bibsource = "http://topc.acm.org/; http://www.math.utah.edu/pub/tex/bib/topc.bib", abstract = "We present a deterministic sorting algorithm, Sample, Partition, and Merge Sort (SPMS), that interleaves the partitioning of a sample sort with merging. Sequentially, it sorts $n$ elements in $ O(n \log n)$ time cache-obliviously with an optimal number of cache misses. The parallel complexity (or critical path length) of the algorithm is $ O(\log n \log \log n)$, which improves on previous bounds for deterministic sample sort. The algorithm also has low false sharing costs. When scheduled by a work-stealing scheduler in a multicore computing environment with a global shared memory and p cores, each having a cache of size $M$ organized in blocks of size $B$, the costs of the additional cache misses and false sharing misses due to this parallel execution are bounded by the cost of $ O(S \cdot M / B)$ and $ O(S \cdot B)$ cache misses, respectively, where $S$ is the number of steals performed during the execution. Finally, SPMS is resource oblivious in that the dependence on machine parameters appear only in the analysis of its performance and not within the algorithm itself.", acknowledgement = ack-nhfb, articleno = "23", fjournal = "ACM Transactions on Parallel Computing", journal-URL = "http://dl.acm.org/citation.cfm?id=2632163", } @Article{Ballard:2017:GEIa, author = "Grey Ballard and Mary Hall and Tim Harris and Brandon Lucia", title = "{Guest Editor} Introduction {PPoPP 2016}, Special Issue 2 of 2", journal = j-TOPC, volume = "4", number = "1", pages = "1:1--1:??", month = oct, year = "2017", CODEN = "????", DOI = "https://doi.org/10.1145/3108141", ISSN = "2329-4949 (print), 2329-4957 (electronic)", ISSN-L = "2329-4949", bibdate = "Tue Oct 10 17:42:07 MDT 2017", bibsource = "http://topc.acm.org/; http://www.math.utah.edu/pub/tex/bib/topc.bib", acknowledgement = ack-nhfb, articleno = "1", fjournal = "ACM Transactions on Parallel Computing", journal-URL = "http://dl.acm.org/citation.cfm?id=2632163", } @Article{Ashkiani:2017:GME, author = "Saman Ashkiani and Andrew Davidson and Ulrich Meyer and John D. Owens", title = "{GPU Multisplit}: an Extended Study of a Parallel Algorithm", journal = j-TOPC, volume = "4", number = "1", pages = "2:1--2:??", month = oct, year = "2017", CODEN = "????", DOI = "https://doi.org/10.1145/3108139", ISSN = "2329-4949 (print), 2329-4957 (electronic)", ISSN-L = "2329-4949", bibdate = "Tue Oct 10 17:42:07 MDT 2017", bibsource = "http://topc.acm.org/; http://www.math.utah.edu/pub/tex/bib/topc.bib", abstract = "Multisplit is a broadly useful parallel primitive that permutes its input data into contiguous buckets or bins, where the function that categorizes an element into a bucket is provided by the programmer. Due to the lack of an efficient multisplit on Graphics Processing Units (GPUs), programmers often choose to implement multisplit with a sort. One way is to first generate an auxiliary array of bucket IDs and then sort input data based on it. In case smaller indexed buckets possess smaller valued keys, another way for multisplit is to directly sort input data. Both methods are inefficient and require more work than necessary: The former requires more expensive data movements while the latter spends unnecessary effort in sorting elements within each bucket. In this work, we provide a parallel model and multiple implementations for the multisplit problem. Our principal focus is multisplit for a small (up to 256) number of buckets. We use warp-synchronous programming models and emphasize warpwide communications to avoid branch divergence and reduce memory usage. We also hierarchically reorder input elements to achieve better coalescing of global memory accesses. On a GeForce GTX 1080 GPU, we can reach a peak throughput of 18.93Gkeys/s (or 11.68Gpairs/s) for a key-only (or key-value) multisplit. Finally, we demonstrate how multisplit can be used as a building block for radix sort. In our multisplit-based sort implementation, we achieve comparable performance to the fastest GPU sort routines, sorting 32-bit keys (and key-value pairs) with a throughput of 3.0Gkeys/s (and 2.1Gpair/s).", acknowledgement = ack-nhfb, articleno = "2", fjournal = "ACM Transactions on Parallel Computing", journal-URL = "http://dl.acm.org/citation.cfm?id=2632163", } @Article{Wang:2017:GGG, author = "Yangzihao Wang and Yuechao Pan and Andrew Davidson and Yuduo Wu and Carl Yang and Leyuan Wang and Muhammad Osama and Chenshan Yuan and Weitang Liu and Andy T. Riffel and John D. Owens", title = "{Gunrock}: {GPU} Graph Analytics", journal = j-TOPC, volume = "4", number = "1", pages = "3:1--3:??", month = oct, year = "2017", CODEN = "????", DOI = "https://doi.org/10.1145/3108140", ISSN = "2329-4949 (print), 2329-4957 (electronic)", ISSN-L = "2329-4949", bibdate = "Tue Oct 10 17:42:07 MDT 2017", bibsource = "http://topc.acm.org/; http://www.math.utah.edu/pub/tex/bib/topc.bib", abstract = "For large-scale graph analytics on the GPU, the irregularity of data access and control flow, and the complexity of programming GPUs, have presented two significant challenges to developing a programmable high-performance graph library. ``Gunrock,'' our graph-processing system designed specifically for the GPU, uses a high-level, bulk-synchronous, data-centric abstraction focused on operations on a vertex or edge frontier. Gunrock achieves a balance between performance and expressiveness by coupling high-performance GPU computing primitives and optimization strategies with a high-level programming model that allows programmers to quickly develop new graph primitives with small code size and minimal GPU programming knowledge. We characterize the performance of various optimization strategies and evaluate Gunrock's overall performance on different GPU architectures on a wide range of graph primitives that span from traversal-based algorithms and ranking algorithms, to triangle counting and bipartite-graph-based algorithms. The results show that on a single GPU, Gunrock has on average at least an order of magnitude speedup over Boost and PowerGraph, comparable performance to the fastest GPU hardwired primitives and CPU shared-memory graph libraries, such as Ligra and Galois, and better performance than any other GPU high-level graph library.", acknowledgement = ack-nhfb, articleno = "3", fjournal = "ACM Transactions on Parallel Computing", journal-URL = "http://dl.acm.org/citation.cfm?id=2632163", } @Article{Chowdhury:2017:AAD, author = "Rezaul Chowdhury and Pramod Ganapathi and Stephen Tschudi and Jesmin Jahan Tithi and Charles Bachmeier and Charles E. Leiserson and Armando Solar-Lezama and Bradley C. Kuszmaul and Yuan Tang", title = "{Autogen}: Automatic Discovery of Efficient Recursive Divide-\&-Conquer Algorithms for Solving Dynamic Programming Problems", journal = j-TOPC, volume = "4", number = "1", pages = "4:1--4:??", month = oct, year = "2017", CODEN = "????", DOI = "https://doi.org/10.1145/3125632", ISSN = "2329-4949 (print), 2329-4957 (electronic)", ISSN-L = "2329-4949", bibdate = "Tue Oct 10 17:42:07 MDT 2017", bibsource = "http://topc.acm.org/; http://www.math.utah.edu/pub/tex/bib/topc.bib", abstract = "We present Autogen --- an algorithm that for a wide class of dynamic programming (DP) problems automatically discovers highly efficient cache-oblivious parallel recursive divide-and-conquer algorithms from inefficient iterative descriptions of DP recurrences. Autogen analyzes the set of DP table locations accessed by the iterative algorithm when run on a DP table of small size and automatically identifies a recursive access pattern and a corresponding provably correct recursive algorithm for solving the DP recurrence. We use Autogen to autodiscover efficient algorithms for several well-known problems. Our experimental results show that several autodiscovered algorithms significantly outperform parallel looping and tiled loop-based algorithms. Also, these algorithms are less sensitive to fluctuations of memory and bandwidth compared with their looping counterparts, and their running times and energy profiles remain relatively more stable. To the best of our knowledge, Autogen is the first algorithm that can automatically discover new nontrivial divide-and-conquer algorithms.", acknowledgement = ack-nhfb, articleno = "4", fjournal = "ACM Transactions on Parallel Computing", journal-URL = "http://dl.acm.org/citation.cfm?id=2632163", } @Article{Steele:2017:AAC, author = "Guy L. {Steele Jr.} and Jean-Baptiste Tristan", title = "Adding Approximate Counters", journal = j-TOPC, volume = "4", number = "1", pages = "5:1--5:??", month = oct, year = "2017", CODEN = "????", DOI = "https://doi.org/10.1145/3132167", ISSN = "2329-4949 (print), 2329-4957 (electronic)", ISSN-L = "2329-4949", bibdate = "Tue Oct 10 17:42:07 MDT 2017", bibsource = "http://topc.acm.org/; http://www.math.utah.edu/pub/tex/bib/topc.bib", abstract = "We describe a general framework for adding the values of two approximate counters to produce a new approximate counter value whose expected estimated value is equal to the sum of the expected estimated values of the given approximate counters. (To the best of our knowledge, this is the first published description of any algorithm for adding two approximate counters.) We then work out implementation details for five different kinds of approximate counter and provide optimized pseudocode. For three of them, we present proofs that the variance of a counter value produced by adding two counter values in this way is bounded, and in fact is no worse, or not much worse, than the variance of the value of a single counter to which the same total number of increment operations have been applied. Addition of approximate counters is useful in massively parallel divide-and-conquer algorithms that use a distributed representation for large arrays of counters. We describe two machine-learning algorithms for topic modeling that use millions of integer counters and confirm that replacing the integer counters with approximate counters is effective, speeding up a GPU-based implementation by over 65\% and a CPU-based implementation by nearly 50\%, as well as reducing memory requirements, without degrading their statistical effectiveness.", acknowledgement = ack-nhfb, articleno = "5", fjournal = "ACM Transactions on Parallel Computing", journal-URL = "http://dl.acm.org/citation.cfm?id=2632163", } @Article{Ballard:2017:GEIb, author = "Grey Ballard and Mary Hall and Tim Harris and Brandon Lucia", title = "{Guest Editor} Introduction {PPoPP 2016}, Special Issue 2 of 2", journal = j-TOPC, volume = "4", number = "2", pages = "6:1--6:??", month = oct, year = "2017", CODEN = "????", DOI = "https://doi.org/10.1145/3108142", ISSN = "2329-4949 (print), 2329-4957 (electronic)", ISSN-L = "2329-4949", bibdate = "Tue Oct 10 17:42:07 MDT 2017", bibsource = "http://topc.acm.org/; http://www.math.utah.edu/pub/tex/bib/topc.bib", acknowledgement = ack-nhfb, articleno = "6", fjournal = "ACM Transactions on Parallel Computing", journal-URL = "http://dl.acm.org/citation.cfm?id=2632163", } @Article{Kalikar:2017:DNM, author = "Saurabh Kalikar and Rupesh Nasre", title = "{DomLock}: a New Multi-Granularity Locking Technique for Hierarchies", journal = j-TOPC, volume = "4", number = "2", pages = "7:1--7:??", month = oct, year = "2017", CODEN = "????", DOI = "https://doi.org/10.1145/3127584", ISSN = "2329-4949 (print), 2329-4957 (electronic)", ISSN-L = "2329-4949", bibdate = "Tue Oct 10 17:42:07 MDT 2017", bibsource = "http://topc.acm.org/; http://www.math.utah.edu/pub/tex/bib/topc.bib", abstract = "We present efficient locking mechanisms for hierarchical data structures. Several applications work on an abstract hierarchy of objects, and a parallel execution on this hierarchy necessitates synchronization across workers operating on different parts of the hierarchy. Existing synchronization mechanisms are too coarse, too inefficient, or too ad hoc, resulting in reduced or unpredictable amount of concurrency. We propose a new locking approach based on the structural properties of the underlying hierarchy. We show that the developed techniques are efficient even when the hierarchy is an arbitrary graph. Theoretically, we present our approach as a locking-cost-minimizing instance of a generic algebraic model of synchronization for hierarchies. Using STMBench7, we illustrate considerable reduction in the locking cost, resulting in an average throughput improvement of 42\%.", acknowledgement = ack-nhfb, articleno = "7", fjournal = "ACM Transactions on Parallel Computing", journal-URL = "http://dl.acm.org/citation.cfm?id=2632163", } @Article{Haider:2017:LRA, author = "Syed Kamran Haider and William Hasenplaugh and Dan Alistarh", title = "{Lease\slash Release}: Architectural Support for Scaling Contended Data Structures", journal = j-TOPC, volume = "4", number = "2", pages = "8:1--8:??", month = oct, year = "2017", CODEN = "????", DOI = "https://doi.org/10.1145/3132168", ISSN = "2329-4949 (print), 2329-4957 (electronic)", ISSN-L = "2329-4949", bibdate = "Tue Oct 10 17:42:07 MDT 2017", bibsource = "http://topc.acm.org/; http://www.math.utah.edu/pub/tex/bib/topc.bib", abstract = "High memory contention is generally agreed to be a worst-case scenario for concurrent data structures. There has been a significant amount of research effort spent investigating designs that minimize contention, and several programming techniques have been proposed to mitigate its effects. However, there are currently few architectural mechanisms to allow scaling contended data structures at high thread counts. In this article, we investigate hardware support for scalable contended data structures. We propose Lease/Release, a simple addition to standard directory-based MESI cache coherence protocols, allowing participants to lease memory, at the granularity of cache lines, by delaying coherence messages for a short, bounded period of time. Our analysis shows that Lease/Release can significantly reduce the overheads of contention for both non-blocking (lock-free) and lock-based data structure implementations while ensuring that no deadlocks are introduced. We validate Lease/Release empirically on the Graphite multiprocessor simulator on a range of data structures, including queue, stack, and priority queue implementations, as well as on transactional applications. Results show that Lease/Release consistently improves both throughput and energy usage, by up to 5x, both for lock-free and lock-based data structure designs.", acknowledgement = ack-nhfb, articleno = "8", fjournal = "ACM Transactions on Parallel Computing", journal-URL = "http://dl.acm.org/citation.cfm?id=2632163", } @Article{Cao:2017:HRD, author = "Man Cao and Minjia Zhang and Aritra Sengupta and Swarnendu Biswas and Michael D. Bond", title = "Hybridizing and Relaxing Dependence Tracking for Efficient Parallel Runtime Support", journal = j-TOPC, volume = "4", number = "2", pages = "9:1--9:??", month = oct, year = "2017", CODEN = "????", DOI = "https://doi.org/10.1145/3108138", ISSN = "2329-4949 (print), 2329-4957 (electronic)", ISSN-L = "2329-4949", bibdate = "Tue Oct 10 17:42:07 MDT 2017", bibsource = "http://topc.acm.org/; http://www.math.utah.edu/pub/tex/bib/topc.bib", abstract = "It is notoriously challenging to develop parallel software systems that are both scalable and correct. Runtime support for parallelism-such as multithreaded record and replay, data race detectors, transactional memory, and enforcement of stronger memory models-helps achieve these goals, but existing commodity solutions slow programs substantially to track (i.e., detect or control) an execution's cross-thread dependencies accurately. Prior work tracks cross-thread dependencies either ``pessimistically,'' slowing every program access, or ``optimistically,'' allowing for lightweight instrumentation of most accesses but dramatically slowing accesses that are conflicting (i.e., involved in cross-thread dependencies). This article presents two novel approaches that seek to improve the performance of dependence tracking. Hybrid tracking (HT) hybridizes pessimistic and optimistic tracking by overcoming a fundamental mismatch between these two kinds of tracking. HT uses an adaptive, profile-based policy to make runtime decisions about switching between pessimistic and optimistic tracking. Relaxed tracking (RT) attempts to reduce optimistic tracking's overhead on conflicting accesses by tracking dependencies in a ``relaxed'' way-meaning that not all dependencies are tracked accurately-while still preserving both program semantics and runtime support's correctness. To demonstrate the usefulness and potential of HT and RT, we build runtime support based on the two approaches. Our evaluation shows that both approaches offer performance advantages over existing approaches, but there exist challenges and opportunities for further improvement. HT and RT are distinct solutions to the same problem. It is easier to build runtime support based on HT than on RT, although RT does not incur the overhead of online profiling. This article presents the two approaches together to inform and inspire future designs for efficient parallel runtime support.", acknowledgement = ack-nhfb, articleno = "9", fjournal = "ACM Transactions on Parallel Computing", journal-URL = "http://dl.acm.org/citation.cfm?id=2632163", } @Article{Chatzopoulos:2017:EES, author = "Georgios Chatzopoulos and Aleksandar Dragojevi{\'c} and Rachid Guerraoui", title = "{ESTIMA}: Extrapolating {ScalabiliTy} of In-Memory Applications", journal = j-TOPC, volume = "4", number = "2", pages = "10:1--10:??", month = oct, year = "2017", CODEN = "????", DOI = "https://doi.org/10.1145/3108137", ISSN = "2329-4949 (print), 2329-4957 (electronic)", ISSN-L = "2329-4949", bibdate = "Tue Oct 10 17:42:07 MDT 2017", bibsource = "http://topc.acm.org/; http://www.math.utah.edu/pub/tex/bib/topc.bib", abstract = "This article presents estima, an easy-to-use tool for extrapolating the scalability of in-memory applications. estima is designed to perform a simple yet important task: Given the performance of an application on a small machine with a handful of cores, estima extrapolates its scalability to a larger machine with more cores, while requiring minimum input from the user. The key idea underlying estima is the use of stalled cycles (e.g., cycles that the processor spends waiting for missed cache line fetches or busy locks). estima measures stalled cycles on a few cores and extrapolates them to more cores, estimating the amount of waiting in the system. estima can be effectively used to predict the scalability of in-memory applications for bigger execution machines. For instance, using measurements of memcached and SQLite on a desktop machine, we obtain accurate predictions of their scalability on a server. Our extensive evaluation shows the effectiveness of estima on a large number of in-memory benchmarks.", acknowledgement = ack-nhfb, articleno = "10", fjournal = "ACM Transactions on Parallel Computing", journal-URL = "http://dl.acm.org/citation.cfm?id=2632163", } @Article{Gulisano:2017:EDS, author = "Vincenzo Gulisano and Yiannis Nikolakopoulos and Daniel Cederman and Marina Papatriantafilou and Philippas Tsigas", title = "Efficient Data Streaming Multiway Aggregation through Concurrent Algorithmic Designs and New Abstract Data Types", journal = j-TOPC, volume = "4", number = "2", pages = "11:1--11:??", month = oct, year = "2017", CODEN = "????", DOI = "https://doi.org/10.1145/3131272", ISSN = "2329-4949 (print), 2329-4957 (electronic)", ISSN-L = "2329-4949", bibdate = "Tue Oct 10 17:42:07 MDT 2017", bibsource = "http://topc.acm.org/; http://www.math.utah.edu/pub/tex/bib/topc.bib", abstract = "Data streaming relies on continuous queries to process unbounded streams of data in a real-time fashion. It is commonly demanding in computation capacity, given that the relevant applications involve very large volumes of data. Data structures act as articulation points and maintain the state of data streaming operators, potentially supporting high parallelism and balancing the work among them. Prompted by this fact, in this work we study and analyze parallelization needs of these articulation points, focusing on the problem of streaming multiway aggregation, where large data volumes are received from multiple input streams. The analysis of the parallelization needs, as well as of the use and limitations of existing aggregate designs and their data structures, leads us to identify needs for appropriate shared objects that can achieve low-latency and high-throughput multiway aggregation. We present the requirements of such objects as abstract data types and we provide efficient lock-free linearizable algorithmic implementations of them, along with new multiway aggregate algorithmic designs that leverage them, supporting both deterministic order-sensitive and order-insensitive aggregate functions. Furthermore, we point out future directions that open through these contributions. The article includes an extensive experimental study, based on a variety of continuous aggregation queries on two large datasets extracted from SoundCloud, a music social network, and from a Smart Grid network. In all the experiments, the proposed data structures and the enhanced aggregate operators improved the processing performance significantly, up to one order of magnitude, in terms of both throughput and latency, over the commonly used techniques based on queues.", acknowledgement = ack-nhfb, articleno = "11", fjournal = "ACM Transactions on Parallel Computing", journal-URL = "http://dl.acm.org/citation.cfm?id=2632163", } @Article{Malas:2018:MIP, author = "Tareq M. Malas and Georg Hager and Hatem Ltaief and David E. Keyes", title = "Multidimensional Intratile Parallelization for Memory-Starved Stencil Computations", journal = j-TOPC, volume = "4", number = "3", pages = "12:1--12:??", month = apr, year = "2018", CODEN = "????", DOI = "https://doi.org/10.1145/3155290", ISSN = "2329-4949 (print), 2329-4957 (electronic)", ISSN-L = "2329-4949", bibdate = "Wed Jan 23 16:12:25 MST 2019", bibsource = "http://www.math.utah.edu/pub/tex/bib/topc.bib", abstract = "Optimizing the performance of stencil algorithms has been the subject of intense research over the last two decades. Since many stencil schemes have low arithmetic intensity, most optimizations focus on increasing the temporal data access locality, thus reducing the data traffic through the main memory interface with the ultimate goal of decoupling from this bottleneck. There are, however, only a few approaches that explicitly leverage the shared cache feature of modern multicore chips. If every thread works on its private, separate cache block, the available cache space can become too small, and sufficient temporal locality may not be achieved. We propose a flexible multidimensional intratile parallelization method for stencil algorithms on multicore CPUs with a shared outer-level cache. This method leads to a significant reduction in the required cache space without adverse effects from hardware prefetching or TLB shortage. Our Girih framework includes an autotuner to select optimal parameter configurations on the target hardware. We conduct performance experiments on two contemporary Intel processors and compare with the state-of-the-art stencil frameworks Pluto and Pochoir, using four corner-case stencil schemes and a wide range of problem sizes. Girih shows substantial performance advantages and best arithmetic intensity at almost all problem sizes, especially on low-intensity stencils with variable coefficients. We study in detail the performance behavior at varying grid sizes using phenomenological performance modeling. Our analysis of energy consumption reveals that our method can save energy through reduced DRAM bandwidth usage even at a marginal performance gain. It is thus well suited for future architectures that will be strongly challenged by the cost of data movement, be it in terms of performance or energy consumption.", acknowledgement = ack-nhfb, articleno = "12", fjournal = "ACM Transactions on Parallel Computing", journal-URL = "http://dl.acm.org/citation.cfm?id=2632163", } @Article{Akbudak:2018:PMS, author = "Kadir Akbudak and Oguz Selvitopi and Cevdet Aykanat", title = "Partitioning Models for Scaling Parallel Sparse Matrix--Matrix Multiplication", journal = j-TOPC, volume = "4", number = "3", pages = "13:1--13:??", month = apr, year = "2018", CODEN = "????", DOI = "https://doi.org/10.1145/3155292", ISSN = "2329-4949 (print), 2329-4957 (electronic)", ISSN-L = "2329-4949", bibdate = "Wed Jan 23 16:12:25 MST 2019", bibsource = "http://www.math.utah.edu/pub/tex/bib/topc.bib", abstract = "We investigate outer-product--parallel, inner-product--parallel, and row-by-row-product--parallel formulations of sparse matrix-matrix multiplication (SpGEMM) on distributed memory architectures. For each of these three formulations, we propose a hypergraph model and a bipartite graph model for distributing SpGEMM computations based on one-dimensional (1D) partitioning of input matrices. We also propose a communication hypergraph model for each formulation for distributing communication operations. The computational graph and hypergraph models adopted in the first phase aim at minimizing the total message volume and balancing the computational loads of processors, whereas the communication hypergraph models adopted in the second phase aim at minimizing the total message count and balancing the message volume loads of processors. That is, the computational partitioning models reduce the bandwidth cost and the communication hypergraph models reduce the latency cost. Our extensive parallel experiments on up to 2048 processors for a wide range of realistic SpGEMM instances show that although the outer-product--parallel formulation scales better, the row-by-row-product--parallel formulation is more viable due to its significantly lower partitioning overhead and competitive scalability. For computational partitioning models, our experimental findings indicate that the proposed bipartite graph models are attractive alternatives to their hypergraph counterparts because of their lower partitioning overhead. Finally, we show that by reducing the latency cost besides the bandwidth cost through using the communication hypergraph models, the parallel SpGEMM time can be further improved up to 32\%.", acknowledgement = ack-nhfb, articleno = "13", fjournal = "ACM Transactions on Parallel Computing", journal-URL = "http://dl.acm.org/citation.cfm?id=2632163", } @Article{Bilardi:2018:LBT, author = "Gianfranco Bilardi and Michele Scquizzato and Francesco Silvestri", title = "A Lower Bound Technique for Communication in {BSP}", journal = j-TOPC, volume = "4", number = "3", pages = "14:1--14:??", month = apr, year = "2018", CODEN = "????", DOI = "https://doi.org/10.1145/3181776", ISSN = "2329-4949 (print), 2329-4957 (electronic)", ISSN-L = "2329-4949", bibdate = "Wed Jan 23 16:12:25 MST 2019", bibsource = "http://www.math.utah.edu/pub/tex/bib/topc.bib", abstract = "Communication is a major factor determining the performance of algorithms on current computing systems; it is therefore valuable to provide tight lower bounds on the communication complexity of computations. This article presents a lower bound technique for the communication complexity in the bulk-synchronous parallel (BSP) model of a given class of DAG computations. The derived bound is expressed in terms of the switching potential of a DAG, that is, the number of permutations that the DAG can realize when viewed as a switching network. The proposed technique yields tight lower bounds for the fast Fourier transform (FFT), and for any sorting and permutation network. A stronger bound is also derived for the periodic balanced sorting network, by applying this technique to suitable subnetworks. Finally, we demonstrate that the switching potential captures communication requirements even in computational models different from BSP, such as the I/O model and the LPRAM.", acknowledgement = ack-nhfb, articleno = "14", fjournal = "ACM Transactions on Parallel Computing", journal-URL = "http://dl.acm.org/citation.cfm?id=2632163", } @Article{Sahin:2018:CSC, author = "Semih Sahin and Bugra Gedik", title = "{C-Stream}: a Co-routine-Based Elastic Stream Processing Engine", journal = j-TOPC, volume = "4", number = "3", pages = "15:1--15:??", month = apr, year = "2018", CODEN = "????", DOI = "https://doi.org/10.1145/3184120", ISSN = "2329-4949 (print), 2329-4957 (electronic)", ISSN-L = "2329-4949", bibdate = "Wed Jan 23 16:12:25 MST 2019", bibsource = "http://www.math.utah.edu/pub/tex/bib/multithreading.bib; http://www.math.utah.edu/pub/tex/bib/topc.bib", abstract = "Stream processing is a computational paradigm for on-the-fly processing of live data. This paradigm lends itself to implementations that can provide high throughput and low latency by taking advantage of various forms of parallelism that are naturally captured by the stream processing model of computation, such as pipeline, task, and data parallelism. In this article, we describe the design and implementation of C-Stream, which is an elastic stream processing engine. C-Stream encompasses three unique properties. First, in contrast to the widely adopted event-based interface for developing streaming operators, C-Stream provides an interface wherein each operator has its own driver loop and relies on data availability application programming interfaces (APIs) to decide when to perform its computations. This self-control-based model significantly simplifies the development of operators that require multiport synchronization. Second, C-Stream contains a dynamic scheduler that manages the multithreaded execution of the operators. The scheduler, which is customizable via plug-ins, enables the execution of the operators as co-routines, using any number of threads. The base scheduler implements back-pressure, provides data availability APIs, and manages preemption and termination handling. Last, C-Stream varies the degree of parallelism to resolve bottlenecks by both dynamically changing the number of threads used to execute an application and adjusting the number of replicas of data-parallel operators. We provide an experimental evaluation of C-Stream. The results show that C-Stream is scalable, highly customizable, and can resolve bottlenecks by dynamically adjusting the level of data parallelism used.", acknowledgement = ack-nhfb, articleno = "15", fjournal = "ACM Transactions on Parallel Computing", journal-URL = "http://dl.acm.org/citation.cfm?id=2632163", } @Article{Agrawal:2018:ISI, author = "Kunal Agrawal and I-Ting Angelina Lee and Michael Spear", title = "Introduction to Special Issue on {SPAA'15}", journal = j-TOPC, volume = "4", number = "4", pages = "16:1--16:??", month = sep, year = "2018", CODEN = "????", DOI = "https://doi.org/10.1145/3226041", ISSN = "2329-4949 (print), 2329-4957 (electronic)", ISSN-L = "2329-4949", bibdate = "Wed Jan 23 16:12:25 MST 2019", bibsource = "http://www.math.utah.edu/pub/tex/bib/topc.bib", acknowledgement = ack-nhfb, articleno = "16", fjournal = "ACM Transactions on Parallel Computing", journal-URL = "http://dl.acm.org/citation.cfm?id=2632163", } @Article{Ahn:2018:ADN, author = "Kook Jin Ahn and Sudipto Guha", title = "Access to Data and Number of Iterations: Dual Primal Algorithms for Maximum Matching under Resource Constraints", journal = j-TOPC, volume = "4", number = "4", pages = "17:1--17:??", month = sep, year = "2018", CODEN = "????", DOI = "https://doi.org/10.1145/3154855", ISSN = "2329-4949 (print), 2329-4957 (electronic)", ISSN-L = "2329-4949", bibdate = "Wed Jan 23 16:12:25 MST 2019", bibsource = "http://www.math.utah.edu/pub/tex/bib/topc.bib", abstract = "In this article, we consider graph algorithms in models of computation where the space usage (random accessible storage, in addition to the read-only input) is sublinear in the number of edges $m$ and the access to input is constrained. These questions arise in many natural settings, and in particular in the analysis of streaming algorithms, MapReduce or similar algorithms, or message passing distributed computing that model constrained parallelism with sublinear central processing. We focus on weighted nonbipartite maximum matching in this article. For any constant $ p > 1$, we provide an iterative sampling-based algorithm for computing a $ (1 - \epsilon)$-approximation of the weighted nonbipartite maximum matching that uses $ O(p / \epsilon)$ rounds of sampling, and $ O(n^{1 + 1 / p})$ space. The results extend to $b$-Matching with small changes. This article combines adaptive sketching literature and fast primal-dual algorithms based on relaxed Dantzig--Wolfe decision procedures. Each round of sampling is implemented through linear sketches and can be executed in a single round of streaming or two rounds of MapReduce. The article also proves that nonstandard linear relaxations of a problem, in particular penalty-based formulations, are helpful in reducing the adaptive dependence of the iterations.", acknowledgement = ack-nhfb, articleno = "17", fjournal = "ACM Transactions on Parallel Computing", journal-URL = "http://dl.acm.org/citation.cfm?id=2632163", } @Article{Alistarh:2018:TAS, author = "Dan Alistarh and William Leiserson and Alexander Matveev and Nir Shavit", title = "{ThreadScan}: Automatic and Scalable Memory Reclamation", journal = j-TOPC, volume = "4", number = "4", pages = "18:1--18:??", month = sep, year = "2018", CODEN = "????", DOI = "https://doi.org/10.1145/3201897", ISSN = "2329-4949 (print), 2329-4957 (electronic)", ISSN-L = "2329-4949", bibdate = "Wed Jan 23 16:12:25 MST 2019", bibsource = "http://www.math.utah.edu/pub/tex/bib/topc.bib", abstract = "The concurrent memory reclamation problem is that of devising a way for a deallocating thread to verify that no other concurrent threads hold references to a memory block being deallocated. To date, in the absence of automatic garbage collection, there is no satisfactory solution to this problem; existing tracking methods like hazard pointers, reference counters, or epoch-based techniques like RCU are either prohibitively expensive or require significant programming expertise to the extent that implementing them efficiently can be worthy of a publication. None of the existing techniques are automatic or even semi-automated. In this article, we take a new approach to concurrent memory reclamation. Instead of manually tracking access to memory locations as done in techniques like hazard pointers, or restricting shared accesses to specific epoch boundaries as in RCU, our algorithm, called ThreadScan, leverages operating system signaling to automatically detect which memory locations are being accessed by concurrent threads. Initial empirical evidence shows that ThreadScan scales surprisingly well and requires negligible programming effort beyond the standard use of Malloc and Free.", acknowledgement = ack-nhfb, articleno = "18", fjournal = "ACM Transactions on Parallel Computing", journal-URL = "http://dl.acm.org/citation.cfm?id=2632163", } @Article{Dimitrov:2018:RDT, author = "Dimitar Dimitrov and Martin Vechev and Vivek Sarkar", title = "Race Detection in Two Dimensions", journal = j-TOPC, volume = "4", number = "4", pages = "19:1--19:??", month = sep, year = "2018", CODEN = "????", DOI = "https://doi.org/10.1145/3264618", ISSN = "2329-4949 (print), 2329-4957 (electronic)", ISSN-L = "2329-4949", bibdate = "Wed Jan 23 16:12:25 MST 2019", bibsource = "http://www.math.utah.edu/pub/tex/bib/topc.bib", abstract = "Dynamic race detection is a program analysis technique for detecting errors caused by undesired interleavings of concurrent tasks. A primary challenge when designing efficient race detection algorithms is to achieve manageable space requirements. State-of-the-art algorithms for unstructured parallelism require $ \Theta (n) $ space per monitored memory location, where n is the total number of tasks. This is a serious drawback when analyzing programs with many tasks. In contrast, algorithms for programs with a series-parallel (SP) structure require only $ \Theta (1) $ space. Unfortunately, it is currently not well understood if there are classes of parallelism beyond SP that can also benefit from and be analyzed with $ \Theta (1) $ space complexity. In this work, we show that structures richer than SP graphs, namely, that of two-dimensional (2D) lattices, can also be analyzed in $ \Theta (1) $ space. Toward that (a) we extend Tarjan's algorithm for finding lowest common ancestors to handle 2D lattices; (b) from that extension we derive a serial algorithm for race detection that can analyze arbitrary task graphs with a 2D lattice structure; (c) we present a restriction to fork-join that admits precisely the 2D lattices as task graphs (e.g., it can express pipeline parallelism). Our work generalizes prior work on structured race detection and aims to provide a deeper understanding of the interplay between structured parallelism and program analysis efficiency.", acknowledgement = ack-nhfb, articleno = "19", fjournal = "ACM Transactions on Parallel Computing", journal-URL = "http://dl.acm.org/citation.cfm?id=2632163", } @Article{Lee:2018:ERD, author = "I-Ting Angelina Lee and Tao B. Schardl", title = "Efficient Race Detection for Reducer Hyperobjects", journal = j-TOPC, volume = "4", number = "4", pages = "20:1--20:??", month = sep, year = "2018", CODEN = "????", DOI = "https://doi.org/10.1145/3205914", ISSN = "2329-4949 (print), 2329-4957 (electronic)", ISSN-L = "2329-4949", bibdate = "Wed Jan 23 16:12:25 MST 2019", bibsource = "http://www.math.utah.edu/pub/tex/bib/multithreading.bib; http://www.math.utah.edu/pub/tex/bib/topc.bib", abstract = "A multithreaded Cilk program that is ostensibly deterministic may nevertheless behave nondeterministically due to programming errors in the code. For a Cilk program that uses reducers-a general reduction mechanism supported in various Cilk dialects-such programming errors are especially challenging to debug, because the errors can expose the nondeterminism in how the Cilk runtime system manages reducers. We identify two unique types of races that arise from incorrect use of reducers in a Cilk program, and we present two algorithms to catch these races. The first algorithm, called the Peer-Set algorithm, detects view-read races, which occur when the program attempts to retrieve a value out of a reducer when the read may result in a nondeterministic value, such as before all previously spawned subcomputations that might update the reducer have necessarily returned. The second algorithm, called the SP+ algorithm, detects determinacy races-instances where a write to a memory location occurs logically in parallel with another access to that location-even when the raced-on memory locations relate to reducers. Both algorithms are provably correct, asymptotically efficient, and can be implemented efficiently in practice. We have implemented both algorithms in our prototype race detector, Rader. When running Peer-Set, Rader incurs a geometric-mean multiplicative overhead of 2.56 over running the benchmark without instrumentation. When running SP+, Rader incurs a geometric-mean multiplicative overhead of 16.94.", acknowledgement = ack-nhfb, articleno = "20", fjournal = "ACM Transactions on Parallel Computing", journal-URL = "http://dl.acm.org/citation.cfm?id=2632163", } @Article{Gilbert:2018:ISI, author = "Seth Gilbert", title = "Introduction to the Special Issue for {SPAA 2016}", journal = j-TOPC, volume = "5", number = "1", pages = "1:1--1:??", month = sep, year = "2018", CODEN = "????", DOI = "https://doi.org/10.1145/3230677", ISSN = "2329-4949 (print), 2329-4957 (electronic)", ISSN-L = "2329-4949", bibdate = "Wed Jan 23 16:12:26 MST 2019", bibsource = "http://www.math.utah.edu/pub/tex/bib/topc.bib", acknowledgement = ack-nhfb, articleno = "1", fjournal = "ACM Transactions on Parallel Computing", journal-URL = "http://dl.acm.org/citation.cfm?id=2632163", } @Article{Mitzenmacher:2018:BBC, author = "Michael Mitzenmacher and Rajmohan Rajaraman and Scott Roche", title = "Better Bounds for Coalescing-Branching Random Walks", journal = j-TOPC, volume = "5", number = "1", pages = "2:1--2:??", month = sep, year = "2018", CODEN = "????", DOI = "https://doi.org/10.1145/3209688", ISSN = "2329-4949 (print), 2329-4957 (electronic)", ISSN-L = "2329-4949", bibdate = "Wed Jan 23 16:12:26 MST 2019", bibsource = "http://www.math.utah.edu/pub/tex/bib/topc.bib", abstract = "Coalescing-branching random walks, or cobra walks for short, are a natural variant of random walks on graphs that can model the spread of disease through contacts or the spread of information in networks. In a k -cobra walk, at each timestep, a subset of the vertices are active; each active vertex chooses k random neighbors (sampled independently and uniformly with replacement) that become active at the next step, and these are the only active vertices at the next step. A natural quantity to study for cobra walks is the cover time, which corresponds to the expected time when all nodes have become infected or received the disseminated information. In this article, we extend previous results for cobra walks in multiple ways. We show that the cover time for the 2-cobra walk on $ [0, n]^d $ is $ O(n) $ (where the order notation hides constant factors that depend on $d$); previous work had shown the cover time was $ O(n \cdot \polylog (n))$. We show that the cover time for a 2-cobra walk on an $n$-vertex $d$ regular graph with conductance $ \phi_G$ is $ O(d^4 \phis^{-2}_G \log^2 n)$, significantly generalizing a previous result that held only for expander graphs with sufficiently high expansion. And, finally, we show that the cover time for a 2-cobra walk on a graph with n vertices and m edges is always $ O(m n^{3 / 4} \log n)$; this is the first result showing that the bound of $ \Theta (n^3)$ for the worst-case cover time for random walks can be beaten using 2-cobra walks.", acknowledgement = ack-nhfb, articleno = "2", fjournal = "ACM Transactions on Parallel Computing", journal-URL = "http://dl.acm.org/citation.cfm?id=2632163", } @Article{Liu:2018:RAN, author = "Mingmou Liu and Xiaoyin Pan and Yitong Yin", title = "Randomized Approximate Nearest Neighbor Search with Limited Adaptivity", journal = j-TOPC, volume = "5", number = "1", pages = "3:1--3:??", month = sep, year = "2018", CODEN = "????", DOI = "https://doi.org/10.1145/3209884", ISSN = "2329-4949 (print), 2329-4957 (electronic)", ISSN-L = "2329-4949", bibdate = "Wed Jan 23 16:12:26 MST 2019", bibsource = "http://www.math.utah.edu/pub/tex/bib/topc.bib", abstract = "We study the complexity of parallel data structures for approximate nearest neighbor search in $d$-dimensional Hamming space $ \{ 0, 1 \}^d$. A classic model for static data structures is the cell-probe model [27]. We consider a cell-probe model with limited adaptivity, where given a $ k \geq 1$, a query is resolved by making at most $k$ rounds of parallel memory accesses to the data structure. We give two randomized algorithms that solve the approximate nearest neighbor search using $k$ rounds of parallel memory accesses: --- a simple algorithm with $ O(k (\log d)^{1 / k})$ total number of memory accesses for all $ k \geq 1$ --- an algorithm with $ O(k + (1 / k \log d)^{O(1 / k)})$ total number of memory accesses for all sufficiently large $k$. Both algorithms use data structures of polynomial size. We prove an $ \Omega (1 / k (\log d)^{1 / k})$ lower bound for the total number of memory accesses for any randomized algorithm solving the approximate nearest neighbor search within $ k \leq \log \log d / 2 \log \log \log d$ rounds of parallel memory accesses on any data structures of polynomial size. This lower bound shows that our first algorithm is asymptotically optimal when $ k = O(1)$. And our second algorithm achieves the asymptotically optimal tradeoff between number of rounds and total number of memory accesses. In the extremal case, when $ k = O(\log \log d / \log \log \log d)$ is big enough, our second algorithm matches the $ \Theta (\log \log d / \log \log \log d)$ tight bound for fully adaptive algorithms for approximate nearest neighbor search in [11].", acknowledgement = ack-nhfb, articleno = "3", fjournal = "ACM Transactions on Parallel Computing", journal-URL = "http://dl.acm.org/citation.cfm?id=2632163", } @Article{Pandurangan:2018:FDA, author = "Gopal Pandurangan and Peter Robinson and Michele Scquizzato", title = "Fast Distributed Algorithms for Connectivity and {MST} in Large Graphs", journal = j-TOPC, volume = "5", number = "1", pages = "4:1--4:??", month = sep, year = "2018", CODEN = "????", DOI = "https://doi.org/10.1145/3209689", ISSN = "2329-4949 (print), 2329-4957 (electronic)", ISSN-L = "2329-4949", bibdate = "Wed Jan 23 16:12:26 MST 2019", bibsource = "http://www.math.utah.edu/pub/tex/bib/topc.bib", abstract = "Motivated by the increasing need to understand the algorithmic foundations of distributed large-scale graph computations, we study a number of fundamental graph problems in a message-passing model for distributed computing where $ k \geq 2 $ machines jointly perform computations on graphs with $n$ nodes (typically, $ n \gg k$). The input graph is assumed to be initially randomly partitioned among the $k$ machines, a common implementation in many real-world systems. Communication is point-to-point, and the goal is to minimize the number of communication rounds of the computation. Our main result is an (almost) optimal distributed randomized algorithm for graph connectivity. Our algorithm runs in $ {\~ O}(n / k^2)$ rounds ($ {\~ O}$ notation hides a $ \polylog (n)$ factor and an additive $ \polylog (n)$ term). This improves over the best previously known bound of $ {\~ O}(n / k)$ [Klauck et al., SODA 2015] and is optimal (up to a polylogarithmic factor) in light of an existing lower bound of $ \Omega^~(n / k^2)$. Our improved algorithm uses a bunch of techniques, including linear graph sketching, that prove useful in the design of efficient distributed graph algorithms. Using the connectivity algorithm as a building block, we then present fast randomized algorithms for computing minimum spanning trees, (approximate) min-cuts, and for many graph verification problems. All these algorithms take $ {\~ O}(n / k^2)$ rounds and are optimal up to polylogarithmic factors. We also show an almost matching lower bound of $ \Omega^~(n / k^2)$ rounds for many graph verification problems by leveraging lower bounds in random-partition communication complexity.", acknowledgement = ack-nhfb, articleno = "4", fjournal = "ACM Transactions on Parallel Computing", journal-URL = "http://dl.acm.org/citation.cfm?id=2632163", } @Article{Korupolu:2018:RPF, author = "Madhukar Korupolu and Rajmohan Rajaraman", title = "Robust and Probabilistic Failure-Aware Placement", journal = j-TOPC, volume = "5", number = "1", pages = "5:1--5:??", month = sep, year = "2018", CODEN = "????", DOI = "https://doi.org/10.1145/3210367", ISSN = "2329-4949 (print), 2329-4957 (electronic)", ISSN-L = "2329-4949", bibdate = "Wed Jan 23 16:12:26 MST 2019", bibsource = "http://www.math.utah.edu/pub/tex/bib/topc.bib", abstract = "Motivated by the growing complexity and heterogeneity of modern data centers, and the prevalence of commodity component failures, this article studies the failure-aware placement problem of placing tasks of a parallel job on machines in the data center with the goal of increasing availability. We consider two models of failures: adversarial and probabilistic. In the adversarial model, each node has a weight (higher weight implying higher reliability) and the adversary can remove any subset of nodes of total weight at most a given bound W and our goal is to find a placement that incurs the least disruption against such an adversary. In the probabilistic model, each node has a probability of failure and we need to find a placement that maximizes the probability that at least K out of N tasks survive at any time. For adversarial failures, we first show that (i) the problems are in $ \Sigma_2 $, the second level of the polynomial hierarchy; (ii) a variant of the problem that we call RobustFap (for Robust Failure-Aware Placement) is co-NP-hard; and (iii) an all-or-nothing version of RobustFap is $ \Sigma_2$-complete. We then give a polynomial-time approximation scheme (PTAS) for RobustFap, a key ingredient of which is a solution that we design for a fractional version of RobustFap. We then study HierRobustFap, which is the fractional RobustFap problem over a hierarchical network, in which failures can occur at any subset of nodes in the hierarchy, and a failure at a node can adversely impact all of its descendants in the hierarchy. To solve HierRobustFap, we introduce a notion of hierarchical max-min fairness and a novel Generalized Spreading algorithm, which is simultaneously optimal for every upper bound W on the total weight of nodes that an adversary can fail. These generalize the classical notion of max-min fairness to work with nodes of differing capacities, differing reliability weights, and hierarchical structures. Using randomized rounding, we extend this to give an algorithm for integral HierRobustFap. For the probabilistic version, we first give an algorithm that achieves an additive $ \epsilon $ approximation in the failure probability for the single level version, called ProbFap, while giving up a $ (1 + \epsilon)$ multiplicative factor in the number of failures. We then extend the result to the hierarchical version, HierProbFap, achieving an \epsilon additive approximation in failure probability while giving up an $ (L + \epsilon)$ multiplicative factor in the number of failures, where $L$ is the number of levels in the hierarchy.", acknowledgement = ack-nhfb, articleno = "5", fjournal = "ACM Transactions on Parallel Computing", journal-URL = "http://dl.acm.org/citation.cfm?id=2632163", } @Article{Zhang:2018:LFT, author = "Deli Zhang and Pierre Laborde and Lance Lebanoff and Damian Dechev", title = "Lock-Free Transactional Transformation for Linked Data Structures", journal = j-TOPC, volume = "5", number = "1", pages = "6:1--6:??", month = sep, year = "2018", CODEN = "????", DOI = "https://doi.org/10.1145/3209690", ISSN = "2329-4949 (print), 2329-4957 (electronic)", ISSN-L = "2329-4949", bibdate = "Wed Jan 23 16:12:26 MST 2019", bibsource = "http://www.math.utah.edu/pub/tex/bib/hash.bib; http://www.math.utah.edu/pub/tex/bib/topc.bib", abstract = "Nonblocking data structures allow scalable and thread-safe access to shared data. They provide individual operations that appear to execute atomically. However, it is often desirable to execute multiple operations atomically in a transactional manner. Previous solutions, such as Software Transactional Memory (STM) and transactional boosting, manage transaction synchronization separately from the underlying data structure's thread synchronization. Although this reduces programming effort, it leads to overhead associated with additional synchronization and the need to rollback aborted transactions. In this work, we present a new methodology for transforming high-performance lock-free linked data structures into high-performance lock-free transactional linked data structures without revamping the data structures' original synchronization design. Our approach leverages the semantic knowledge of the data structure to eliminate the overhead of false conflicts and rollbacks. We encapsulate all operations, operands, and transaction status in a transaction descriptor, which is shared among the nodes accessed by the same transaction. We coordinate threads to help finish the remaining operations of delayed transactions based on their transaction descriptors. When a transaction fails, we recover the correct abstract state by reversely interpreting the logical status of a node. We also present an obstruction-free version of our algorithm that can be applied to dynamic execution scenarios and an example of our approach applied to a hash map. In our experimental evaluation using transactions with randomly generated operations, our lock-free transactional data structures outperform the transactional boosted ones by 70\% on average. They also outperform the alternative STM-based approaches by a factor of 2 to 13 across all scenarios. More importantly, we achieve 4,700 to 915,000 times fewer spurious aborts than the alternatives.", acknowledgement = ack-nhfb, articleno = "6", fjournal = "ACM Transactions on Parallel Computing", journal-URL = "http://dl.acm.org/citation.cfm?id=2632163", } @Article{Muller:2018:NHP, author = "Michel M{\"u}ller and Takayuki Aoki", title = "New High Performance {GPGPU} Code Transformation Framework Applied to Large Production Weather Prediction Code", journal = j-TOPC, volume = "5", number = "2", pages = "7:1--7:??", month = jan, year = "2018", CODEN = "????", DOI = "https://doi.org/10.1145/3291523", ISSN = "2329-4949 (print), 2329-4957 (electronic)", ISSN-L = "2329-4949", bibdate = "Wed Jan 23 16:12:26 MST 2019", bibsource = "http://www.math.utah.edu/pub/tex/bib/fortran3.bib; http://www.math.utah.edu/pub/tex/bib/topc.bib", abstract = "We introduce ``Hybrid Fortran,'' a new approach that allows a high-performance GPGPU port for structured grid Fortran codes. This technique only requires minimal changes for a CPU targeted codebase, which is a significant advancement in terms of productivity. It has been successfully applied to both dynamical core and physical processes of ASUCA, a Japanese mesoscale weather prediction model with more than 150k lines of code. By means of a minimal weather application that resembles ASUCA's code structure, Hybrid Fortran is compared to both a performance model as well as today's commonly used method, OpenACC. As a result, the Hybrid Fortran implementation is shown to deliver the same or better performance than OpenACC, and its performance agrees with the model both on CPU and GPU. In a full-scale production run, using an ASUCA grid with 1581 $ \times $ 1301 $ \times $ 58 cells and real-world weather data in 2km resolution, 24 NVIDIA Tesla P100 running the Hybrid Fortran-based GPU port are shown to replace more than fifty 18-core Intel Xeon Broadwell E5-2695 v4 running the reference implementation-an achievement comparable to more invasive GPGPU rewrites of other weather models.", acknowledgement = ack-nhfb, articleno = "7", fjournal = "ACM Transactions on Parallel Computing", journal-URL = "http://dl.acm.org/citation.cfm?id=2632163", } @Article{Burtscher:2018:HQF, author = "Martin Burtscher and Sindhu Devale and Sahar Azimi and Jayadharini Jaiganesh and Evan Powers", title = "A High-Quality and Fast Maximal Independent Set Implementation for {GPUs}", journal = j-TOPC, volume = "5", number = "2", pages = "8:1--8:??", month = jan, year = "2018", CODEN = "????", DOI = "https://doi.org/10.1145/3291525", ISSN = "2329-4949 (print), 2329-4957 (electronic)", ISSN-L = "2329-4949", bibdate = "Wed Jan 23 16:12:26 MST 2019", bibsource = "http://www.math.utah.edu/pub/tex/bib/topc.bib", abstract = "Computing a maximal independent set is an important step in many parallel graph algorithms. This article introduces ECL-MIS, a maximal independent set implementation that works well on GPUs. It includes key optimizations to speed up computation, reduce the memory footprint, and increase the set size. Its CUDA implementation requires fewer than 30 kernel statements, runs asynchronously, and produces a deterministic result. It outperforms the maximal independent set implementations of Pannotia, CUSP, and IrGL on each of the 16 tested graphs of various types and sizes. On a Titan X GPU, ECL-MIS is between 3.9 and 100 times faster (11.5 times, on average). ECL-MIS running on the GPU is also faster than the parallel CPU codes Ligra, Ligra+, and PBBS running on 20 Xeon cores, which it outperforms by 4.1 times, on average. At the same time, ECL-MIS produces maximal independent sets that are up to 52\% larger (over 10\%, on average) compared to these preexisting CPU and GPU implementations. Whereas these codes produce maximal independent sets that are, on average, about 15\% smaller than the largest possible such sets, ECL-MIS sets are less than 6\% smaller than the maximum independent sets.", acknowledgement = ack-nhfb, articleno = "8", fjournal = "ACM Transactions on Parallel Computing", journal-URL = "http://dl.acm.org/citation.cfm?id=2632163", } @Article{Liu:2018:APR, author = "Junhong Liu and Guangming Tan and Yulong Luo and Jiajia Li and Zeyao Mo and Ninghui Sun", title = "An Autotuning Protocol to Rapidly Build Autotuners", journal = j-TOPC, volume = "5", number = "2", pages = "9:1--9:??", month = jan, year = "2018", CODEN = "????", DOI = "https://doi.org/10.1145/3291527", ISSN = "2329-4949 (print), 2329-4957 (electronic)", ISSN-L = "2329-4949", bibdate = "Wed Jan 23 16:12:26 MST 2019", bibsource = "http://www.math.utah.edu/pub/tex/bib/topc.bib", abstract = "Automatic performance tuning (Autotuning) is an increasingly critical tuning technique for the high portable performance of Exascale applications. However, constructing an autotuner from scratch remains a challenge, even for domain experts. In this work, we propose a performance tuning and knowledge management suite (PAK) to help rapidly build autotuners. In order to accommodate existing autotuning techniques, we present an autotuning protocol that is composed of an extractor, producer, optimizer, evaluator, and learner. To achieve modularity and reusability, we also define programming interfaces for each protocol component as the fundamental infrastructure, which provides a customizable mechanism to deploy knowledge mining in the performance database. PAK's usability is demonstrated by studying two important computational kernels: stencil computation and sparse matrix-vector multiplication (SpMV). Our proposed autotuner based on PAK shows comparable performance and higher productivity than traditional autotuners by writing just a few tens of code using our autotuning protocol.", acknowledgement = ack-nhfb, articleno = "9", fjournal = "ACM Transactions on Parallel Computing", journal-URL = "http://dl.acm.org/citation.cfm?id=2632163", } @Article{Anta:2018:SDP, author = "Antonio Fern{\'a}ndez Anta and Dariusz R. Kowalski and Miguel A. Mosteiro and Prudence W. H. Wong", title = "Scheduling Dynamic Parallel Workload of Mobile Devices with Access Guarantees", journal = j-TOPC, volume = "5", number = "2", pages = "10:1--10:??", month = jan, year = "2018", CODEN = "????", DOI = "https://doi.org/10.1145/3291529", ISSN = "2329-4949 (print), 2329-4957 (electronic)", ISSN-L = "2329-4949", bibdate = "Wed Jan 23 16:12:26 MST 2019", bibsource = "http://www.math.utah.edu/pub/tex/bib/topc.bib", abstract = "We study a dynamic resource-allocation problem that arises in various parallel computing scenarios, such as mobile cloud computing, cloud computing systems, Internet of Things systems, and others. Generically, we model the architecture as client mobile devices and static base stations. Each client ``arrives'' to the system to upload data to base stations by radio transmissions and then ``leaves.'' The problem, called Station Assignment, is to assign clients to stations so that every client uploads their data under some restrictions, including a target subset of stations, a maximum delay between transmissions, a volume of data to upload, and a maximum bandwidth for each station. We study the solvability of Station Assignment under an adversary that controls the arrival and departure of clients, limited to maximum rate and burstiness of such arrivals. We show upper and lower bounds on the rate and burstiness for various client arrival schedules and protocol classes. To the best of our knowledge, this is the first time that Station Assignment is studied under adversarial arrivals and departures.", acknowledgement = ack-nhfb, articleno = "10", fjournal = "ACM Transactions on Parallel Computing", journal-URL = "http://dl.acm.org/citation.cfm?id=2632163", } @Article{Mirhosseini:2018:BBA, author = "Amirhossein Mirhosseini and Mohammad Sadrosadati and Fatemeh Aghamohammadi and Mehdi Modarressi and Hamid Sarbazi-Azad", title = "{BARAN}: Bimodal Adaptive Reconfigurable-Allocator Network-on-Chip", journal = j-TOPC, volume = "5", number = "3", pages = "11:1--11:??", month = jan, year = "2018", CODEN = "????", DOI = "https://doi.org/10.1145/3294049", ISSN = "2329-4949 (print), 2329-4957 (electronic)", ISSN-L = "2329-4949", bibdate = "Wed Jan 23 16:12:26 MST 2019", bibsource = "http://www.math.utah.edu/pub/tex/bib/topc.bib", abstract = "Virtual channels are employed to improve the throughput under high traffic loads in Networks-on-Chips (NoCs). However, they can impose non-negligible overheads on performance by prolonging clock cycle time, especially under low traffic loads where the impact of virtual channels on performance is trivial. In this article, we propose a novel architecture, called BARAN, that can either improve on-chip network performance or reduce its power consumption (depending on the specific implementation chosen), not both at the same time, when virtual channels are underutilized; that is, the average number of virtual channel allocation requests per cycle is lower than the number of total virtual channels. We also introduce a reconfigurable arbitration logic within the BARAN architecture that can be configured to have multiple latencies and, hence, multiple slack times. The increased slack times are then used to reduce the supply voltage of the routers or increase their clock frequency in order to reduce power consumption or improve the performance of the whole NoC system. The power-centric design of BARAN reduces NoC power consumption by 43.4\% and 40.6\% under CMP and GPU workloads, on average, respectively, compared to a baseline architecture while imposing negligible area and performance overheads. The performance-centric design of BARAN reduces the average packet latency by 45.4\% and 42.1\%, on average, under CMP and GPU workloads, respectively, compared to the baseline architecture while increasing power consumption by 39.7\% and 43.7\%, on average. Moreover, the performance-centric BARAN postpones the network saturation rate by 11.5\% under uniform random traffic compared to the baseline architecture.", acknowledgement = ack-nhfb, articleno = "11", fjournal = "ACM Transactions on Parallel Computing", journal-URL = "http://dl.acm.org/citation.cfm?id=2632163", } @Article{Amer:2018:LCM, author = "Abdelhalim Amer and Huiwei Lu and Pavan Balaji and Milind Chabbi and Yanjie Wei and Jeff Hammond and Satoshi Matsuoka", title = "Lock Contention Management in Multithreaded {MPI}", journal = j-TOPC, volume = "5", number = "3", pages = "12:1--12:??", month = jan, year = "2018", CODEN = "????", DOI = "https://doi.org/10.1145/3275443", ISSN = "2329-4949 (print), 2329-4957 (electronic)", ISSN-L = "2329-4949", bibdate = "Wed Jan 23 16:12:26 MST 2019", bibsource = "http://www.math.utah.edu/pub/tex/bib/multithreading.bib; http://www.math.utah.edu/pub/tex/bib/pvm.bib; http://www.math.utah.edu/pub/tex/bib/topc.bib", URL = "https://dl.acm.org/ft_gateway.cfm?id=3275443", abstract = "In this article, we investigate contention management in lock-based thread-safe MPI libraries. Specifically, we make two assumptions: (1) locks are the only form of synchronization when protecting communication paths; and (2) contention occurs, and thus serialization is unavoidable. Our work distinguishes between lock acquisitions with respect to work being performed inside a critical section; productive vs. unproductive. Waiting for message reception without doing anything else inside a critical section is an example of unproductive lock acquisition. We show that the high-throughput nature of modern scalable locking protocols translates into better communication progress for throughput-intensive MPI communication but negatively impacts latency-sensitive communication because of overzealous unproductive lock acquisition. To reduce unproductive lock acquisitions, we devised a method that promotes threads with productive work using a generic two-level priority locking protocol. Our results show that using a high-throughput protocol for productive work and a fair protocol for less productive code paths ensures the best tradeoff for fine-grained communication, whereas a fair protocol is sufficient for more coarse-grained communication. Although these efforts have been rewarding, scalability degradation remains significant. We discuss techniques that diverge from the pure locking model and offer the potential to further improve scalability.", acknowledgement = ack-nhfb, articleno = "12", fjournal = "ACM Transactions on Parallel Computing", journal-URL = "http://dl.acm.org/citation.cfm?id=2632163", } @Article{Chen:2018:PDG, author = "Rong Chen and Jiaxin Shi and Yanzhe Chen and Binyu Zang and Haibing Guan and Haibo Chen", title = "{PowerLyra}: Differentiated Graph Computation and Partitioning on Skewed Graphs", journal = j-TOPC, volume = "5", number = "3", pages = "13:1--13:??", month = jan, year = "2018", CODEN = "????", DOI = "https://doi.org/10.1145/3298989", ISSN = "2329-4949 (print), 2329-4957 (electronic)", ISSN-L = "2329-4949", bibdate = "Wed Jan 23 16:12:26 MST 2019", bibsource = "http://www.math.utah.edu/pub/tex/bib/topc.bib", abstract = "Natural graphs with skewed distributions raise unique challenges to distributed graph computation and partitioning. Existing graph-parallel systems usually use a ``one-size-fits-all'' design that uniformly processes all vertices, which either suffer from notable load imbalance and high contention for high-degree vertices (e.g., Pregel and GraphLab) or incur high communication cost and memory consumption even for low-degree vertices (e.g., PowerGraph and GraphX). In this article, we argue that skewed distributions in natural graphs also necessitate differentiated processing on high-degree and low-degree vertices. We then introduce PowerLyra, a new distributed graph processing system that embraces the best of both worlds of existing graph-parallel systems. Specifically, PowerLyra uses centralized computation for low-degree vertices to avoid frequent communications and distributes the computation for high-degree vertices to balance workloads. PowerLyra further provides an efficient hybrid graph partitioning algorithm (i.e., hybrid-cut) that combines edge-cut (for low-degree vertices) and vertex-cut (for high-degree vertices) with heuristics. To improve cache locality of inter-node graph accesses, PowerLyra further provides a locality-conscious data layout optimization. PowerLyra is implemented based on the latest GraphLab and can seamlessly support various graph algorithms running in both synchronous and asynchronous execution modes. A detailed evaluation on three clusters using various graph-analytics and MLDM (Machine Learning and Data Mining) applications shows that PowerLyra outperforms PowerGraph by up to 5.53X (from 1.24X) and 3.26X (from 1.49X) for real-world and synthetic graphs, respectively, and is much faster than other systems like GraphX and Giraph, yet with much less memory consumption. A porting of hybrid-cut to GraphX further confirms the efficiency and generality of PowerLyra.", acknowledgement = ack-nhfb, articleno = "13", fjournal = "ACM Transactions on Parallel Computing", journal-URL = "http://dl.acm.org/citation.cfm?id=2632163", } @Article{Aravind:2019:GME, author = "Alex Aravind and Wim H. Hesselink", title = "Group Mutual Exclusion by Fetch-and-increment", journal = j-TOPC, volume = "5", number = "4", pages = "14:1--14:??", month = mar, year = "2019", CODEN = "????", DOI = "https://doi.org/10.1145/3309202", ISSN = "2329-4949 (print), 2329-4957 (electronic)", ISSN-L = "2329-4949", bibdate = "Mon Mar 11 18:54:51 MDT 2019", bibsource = "http://www.math.utah.edu/pub/tex/bib/topc.bib", URL = "https://dl.acm.org/ft_gateway.cfm?id=3309202\&ftid=2042588\&dwn=1\&CFID=115464021\&CFTOKEN=e83128dce764b9e9-5990FFA0-D877-BDE1-A02F1158E3AD2EED", abstract = "The group mutual exclusion (GME) problem (also called the room synchronization problem) arises in various practical applications that require concurrent data sharing. Group mutual exclusion aims to achieve exclusive access to a shared resource (a shared room) while facilitating concurrency among non-conflicting requests. The problem is that threads with distinct interests are not allowed to access the shared resource concurrently, but multiple threads with same interest can. In Blelloch et al. (2003), the authors presented a simple solution to the room synchronization problem using fetch8add (F8A) and test-and-set (T8S) atomic operations. This algorithm has $O(m)$ remote memory references (RMRs) in the cache coherent (CC) model, where $m$ is the number of forums. In Bhatt and Huang (2010), an open problem was posed: `` Is it possible to design a GME algorithm with constant RMR for the CC model using fetch8add instructions? '' This question is partially answered in this article by presenting a group mutual exclusion algorithm using fetch-and-increment instructions. The algorithm is simple and scalable.", acknowledgement = ack-nhfb, articleno = "14", fjournal = "ACM Transactions on Parallel Computing", journal-URL = "http://dl.acm.org/citation.cfm?id=2632163", } @Article{Behzad:2019:OPH, author = "Babak Behzad and Surendra Byna and Prabhat and Marc Snir", title = "Optimizing {I/O} Performance of {HPC} Applications with Autotuning", journal = j-TOPC, volume = "5", number = "4", pages = "15:1--15:??", month = mar, year = "2019", CODEN = "????", DOI = "https://doi.org/10.1145/3309205", ISSN = "2329-4949 (print), 2329-4957 (electronic)", ISSN-L = "2329-4949", bibdate = "Mon Mar 11 18:54:51 MDT 2019", bibsource = "http://www.math.utah.edu/pub/tex/bib/topc.bib", URL = "https://dl.acm.org/ft_gateway.cfm?id=3309205\&ftid=2044223\&dwn=1\&CFID=115464021\&CFTOKEN=e83128dce764b9e9-5990FFA0-D877-BDE1-A02F1158E3AD2EED", abstract = "Parallel Input output is an essential component of modern high-performance computing (HPC). Obtaining good I/O performance for a broad range of applications on diverse HPC platforms is a major challenge, in part, because of complex inter dependencies between I/O middleware and hardware. The parallel file system and I/O middleware layers all offer optimization parameters that can, in theory, result in better I/O performance. Unfortunately, the right combination of parameters is highly dependent on the application, HPC platform, problem size, and concurrency. Scientific application developers do not have the time or expertise to take on the substantial burden of identifying good parameters for each problem configuration. They resort to using system defaults, a choice that frequently results in poor I/O performance. We expect this problem to be compounded on exascale-class machines, which will likely have a deeper software stack with hierarchically arranged hardware resources. We present as a solution to this problem an autotuning system for optimizing I/O performance, I/O performance modeling, I/O tuning, and I/O patterns. We demonstrate the value of this framework across several HPC platforms and applications at scale.", acknowledgement = ack-nhfb, articleno = "15", fjournal = "ACM Transactions on Parallel Computing", journal-URL = "http://dl.acm.org/citation.cfm?id=2632163", } @Article{Maier:2019:CHT, author = "Tobias Maier and Peter Sanders and Roman Dementiev", title = "Concurrent Hash Tables: Fast and General(?)!", journal = j-TOPC, volume = "5", number = "4", pages = "16:1--16:??", month = mar, year = "2019", CODEN = "????", DOI = "https://doi.org/10.1145/3309206", ISSN = "2329-4949 (print), 2329-4957 (electronic)", ISSN-L = "2329-4949", bibdate = "Mon Mar 11 18:54:51 MDT 2019", bibsource = "http://www.math.utah.edu/pub/tex/bib/hash.bib; http://www.math.utah.edu/pub/tex/bib/topc.bib", URL = "https://dl.acm.org/ft_gateway.cfm?id=3309206\&ftid=2042589\&dwn=1\&CFID=115464021\&CFTOKEN=e83128dce764b9e9-5990FFA0-D877-BDE1-A02F1158E3AD2EED", abstract = "Concurrent hash tables are one of the most important concurrent data structures, which are used in numerous applications. For some applications, it is common that hash table accesses dominate the execution time. To efficiently solve these problems in parallel, we need implementations that achieve speedups in highly concurrent scenarios. Unfortunately, currently available concurrent hashing libraries are far away from this requirement, in particular, when adaptively sized tables are necessary or contention on some elements occurs. Our starting point for better performing data structures is a fast and simple lock-free concurrent hash table based on linear probing that is, however, limited to word-sized key-value types and does not support dynamic size adaptation. We explain how to lift these limitations in a provably scalable way and demonstrate that dynamic growing has a performance overhead comparable to the same generalization in sequential hash tables. We perform extensive experiments comparing the performance of our implementations with six of the most widely used concurrent hash tables. Ours are considerably faster than the best algorithms with similar restrictions and an order of magnitude faster than the best more general tables. In some extreme cases, the difference even approaches four orders of magnitude. All our implementations discussed in this publication can be found on github [17].", acknowledgement = ack-nhfb, articleno = "16", fjournal = "ACM Transactions on Parallel Computing", journal-URL = "http://dl.acm.org/citation.cfm?id=2632163", } @Article{Cruz:2019:ETM, author = "Eduardo H. M. Cruz and Matthias Diener and La{\'e}rcio L. Pilla and Philippe O. A. Navaux", title = "{EagerMap}: a Task Mapping Algorithm to Improve Communication and Load Balancing in Clusters of Multicore Systems", journal = j-TOPC, volume = "5", number = "4", pages = "17:1--17:??", month = mar, year = "2019", CODEN = "????", DOI = "https://doi.org/10.1145/3309711", ISSN = "2329-4949 (print), 2329-4957 (electronic)", ISSN-L = "2329-4949", bibdate = "Mon Mar 11 18:54:51 MDT 2019", bibsource = "http://www.math.utah.edu/pub/tex/bib/topc.bib", URL = "https://dl.acm.org/ft_gateway.cfm?id=3309711\&ftid=2044225\&dwn=1\&CFID=115464021\&CFTOKEN=e83128dce764b9e9-5990FFA0-D877-BDE1-A02F1158E3AD2EED", abstract = "Communication between tasks and load imbalance have been identified as a major challenge for the performance and energy efficiency of parallel applications. A common way to improve communication is to increase its locality, that is, to reduce the distances of data transfers, prioritizing the usage of faster and more efficient local interconnections over remote ones. Regarding load imbalance, cores should execute a similar amount of work. An important problem to be solved in this context is how to determine an optimized mapping of tasks to cluster nodes and cores that increases the overall locality and load balancing. In this article, we propose the EagerMap algorithm to determine task mappings, which is based on a greedy heuristic to match application communication patterns to hardware hierarchies and which can also consider the task load. Compared to previous algorithms, EagerMap is faster, scales better, and supports more types of computer systems, while maintaining the same or better quality of the determined task mapping. EagerMap is therefore an interesting choice for task mapping on a variety of modern parallel architectures.", acknowledgement = ack-nhfb, articleno = "17", fjournal = "ACM Transactions on Parallel Computing", journal-URL = "http://dl.acm.org/citation.cfm?id=2632163", }