%%% [Stray Web-page badge text (``Valid HTML 4.0! Valid CSS!'') from the
%%% original capture; neutralized into a comment during review.]
%%% -*-BibTeX-*-
%%% ====================================================================
%%%  BibTeX-file{
%%%     author          = "Nelson H. F. Beebe",
%%%     version         = "1.12",
%%%     date            = "11 March 2019",
%%%     time            = "18:58:03 MST",
%%%     filename        = "topc.bib",
%%%     address         = "University of Utah
%%%                        Department of Mathematics, 110 LCB
%%%                        155 S 1400 E RM 233
%%%                        Salt Lake City, UT 84112-0090
%%%                        USA",
%%%     telephone       = "+1 801 581 5254",
%%%     FAX             = "+1 801 581 4148",
%%%     URL             = "http://www.math.utah.edu/~beebe",
%%%     checksum        = "00952 5174 29538 275848",
%%%     email           = "beebe at math.utah.edu, beebe at acm.org,
%%%                        beebe at computer.org (Internet)",
%%%     codetable       = "ISO/ASCII",
%%%     keywords        = "ACM Transactions on Parallel Computing
%%%                       (TOPC); bibliography; BibTeX",
%%%     license         = "public domain",
%%%     supported       = "yes",
%%%     docstring       = "This is a COMPLETE BibTeX bibliography for
%%%                        ACM Transactions on Parallel Computing (TOPC)
%%%                        (CODEN ????, ISSN 2329-4949 (print),
%%%                        2329-4957 (electronic)).  The journal appears
%%%                        quarterly, and publication began with volume
%%%                        1, number 1, in September 2014.
%%%
%%%                        At version 1.12, the COMPLETE journal
%%%                        coverage looked like this:
%%%
%%%                             2014 (   8)    2016 (  25)    2018 (  22)
%%%                             2015 (  29)    2017 (  16)    2019 (   4)
%%%
%%%                             Article:        104
%%%
%%%                             Total entries:  104
%%%
%%%                        The journal Web page can be found at:
%%%
%%%                            http://topc.acm.org/
%%%
%%%                        The journal table of contents page is at:
%%%
%%%                            http://dl.acm.org/pub.cfm?id=J1496
%%%                            http://dl.acm.org/citation.cfm?id=2632163
%%%
%%%                        Qualified subscribers can retrieve the full
%%%                        text of recent articles in PDF form.
%%%
%%%                        The initial draft was extracted from the ACM
%%%                        Web pages.
%%%
%%%                        ACM copyrights explicitly permit abstracting
%%%                        with credit, so article abstracts, keywords,
%%%                        and subject classifications have been
%%%                        included in this bibliography wherever
%%%                        available.  Article reviews have been
%%%                        omitted, until their copyright status has
%%%                        been clarified.
%%%
%%%                        bibsource keys in the bibliography entries
%%%                        below indicate the entry originally came
%%%                        from the computer science bibliography
%%%                        archive, even though it has likely since
%%%                        been corrected and updated.
%%%
%%%                        URL keys in the bibliography point to
%%%                        World Wide Web locations of additional
%%%                        information about the entry.
%%%
%%%                        BibTeX citation tags are uniformly chosen
%%%                        as name:year:abbrev, where name is the
%%%                        family name of the first author or editor,
%%%                        year is a 4-digit number, and abbrev is a
%%%                        3-letter condensation of important title
%%%                        words. Citation tags were automatically
%%%                        generated by software developed for the
%%%                        BibNet Project.
%%%
%%%                        In this bibliography, entries are sorted in
%%%                        publication order, using ``bibsort -byvolume.''
%%%
%%%                        The checksum field above contains a CRC-16
%%%                        checksum as the first value, followed by the
%%%                        equivalent of the standard UNIX wc (word
%%%                        count) utility output of lines, words, and
%%%                        characters.  This is produced by Robert
%%%                        Solovay's checksum utility.",
%%%  }
%%% ====================================================================
%%% Preamble: loads bibnames.sty (name macros) and defines \TM as a
%%% superscript trademark sign for use in titles/abstracts.
@Preamble{"\input bibnames.sty" #
    "\def \TM {${}^{\sc TM}$}"
}

%%% ====================================================================
%%% Acknowledgement abbreviations:
%%% Acknowledgement string (bibliographer contact details); every entry
%%% below references it via ``acknowledgement = ack-nhfb''.
@String{ack-nhfb = "Nelson H. F. Beebe,
                    University of Utah,
                    Department of Mathematics, 110 LCB,
                    155 S 1400 E RM 233,
                    Salt Lake City, UT 84112-0090, USA,
                    Tel: +1 801 581 5254,
                    FAX: +1 801 581 4148,
                    e-mail: \path|beebe@math.utah.edu|,
                            \path|beebe@acm.org|,
                            \path|beebe@computer.org| (Internet),
                    URL: \path|http://www.math.utah.edu/~beebe/|"}

%%% ====================================================================
%%% Journal abbreviations:
%%% Journal-name macro; used unquoted in the journal = field of every
%%% entry so the spelling can be changed in one place.
@String{j-TOPC                  = "ACM Transactions on Parallel Computing
                                  (TOPC)"}

%%% ====================================================================
%%% Bibliography entries:
%%% NOTE(review): the captured abstract for this editorial consisted of
%%% the single word ``Techniques'' (Web-scrape residue, not an
%%% abstract), so the abstract field has been dropped.
@Article{Gibbons:2014:ATP,
  author =       "Phillip B. Gibbons",
  title =        "{ACM Transactions on Parallel Computing}: an
                 introduction",
  journal =      j-TOPC,
  volume =       "1",
  number =       "1",
  pages =        "1:1--1:??",
  month =        sep,
  year =         "2014",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2661651",
  ISSN =         "2329-4949 (print), 2329-4957 (electronic)",
  ISSN-L =       "2329-4949",
  bibdate =      "Fri Oct 17 12:28:03 MDT 2014",
  bibsource =    "http://topc.acm.org/;
                 http://www.math.utah.edu/pub/tex/bib/topc.bib",
  acknowledgement = ack-nhfb,
  articleno =    "1",
  fjournal =     "ACM Transactions on Parallel Computing",
  journal-URL =  "http://dl.acm.org/citation.cfm?id=2632163",
}

%%% TOPC v1n1 (September 2014), article 2: front-matter introduction;
%%% no abstract was captured for this entry.
@Article{Lilja:2014:I,
  author =       "David J. Lilja",
  title =        "Introduction",
  journal =      j-TOPC,
  volume =       "1",
  number =       "1",
  pages =        "2:1--2:??",
  month =        sep,
  year =         "2014",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2609798",
  ISSN =         "2329-4949 (print), 2329-4957 (electronic)",
  ISSN-L =       "2329-4949",
  bibdate =      "Fri Oct 17 12:28:03 MDT 2014",
  bibsource =    "http://topc.acm.org/;
                 http://www.math.utah.edu/pub/tex/bib/topc.bib",
  acknowledgement = ack-nhfb,
  articleno =    "2",
  fjournal =     "ACM Transactions on Parallel Computing",
  journal-URL =  "http://dl.acm.org/citation.cfm?id=2632163",
}

%%% TOPC v1n1, article 3: the MACPO memory-trace tool for
%%% data-structure-centric performance metrics on multicore nodes.
@Article{Rane:2014:EPO,
  author =       "Ashay Rane and James Browne",
  title =        "Enhancing Performance Optimization of Multicore\slash
                 Multichip Nodes with Data Structure Metrics",
  journal =      j-TOPC,
  volume =       "1",
  number =       "1",
  pages =        "3:1--3:??",
  month =        sep,
  year =         "2014",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2588788",
  ISSN =         "2329-4949 (print), 2329-4957 (electronic)",
  ISSN-L =       "2329-4949",
  bibdate =      "Fri Oct 17 12:28:03 MDT 2014",
  bibsource =    "http://topc.acm.org/;
                 http://www.math.utah.edu/pub/tex/bib/topc.bib",
  abstract =     "Program performance optimization is usually based
                 solely on measurements of execution behavior of code
                 segments using hardware performance counters. However,
                 memory access patterns are critical performance
                 limiting factors for today's multicore chips where
                 performance is highly memory bound. Therefore diagnoses
                 and selection of optimizations based only on
                 measurements of the execution behavior of code segments
                 are incomplete because they do not incorporate
                 knowledge of memory access patterns and behaviors. This
                 article presents a low-overhead tool (MACPO) that
                 captures memory traces and computes metrics for the
                 memory access behavior of source-level (C, C++,
                 Fortran) data structures. MACPO explicitly targets the
                 measurement and metrics important to performance
                 optimization for multicore chips. The article also
                 presents a complete process for integrating measurement
                 and analyses of code execution with measurements and
                 analyses of memory access patterns and behaviors for
                 performance optimization, specifically targeting
                 multicore chips and multichip nodes of clusters. MACPO
                 uses more realistic cache models for computation of
                 latency metrics than those used by previous tools.
                 Evaluation of the effectiveness of adding memory access
                 behavior characteristics of data structures to
                 performance optimization was done on subsets of the
                 ASCI, NAS and Rodinia parallel benchmarks and two
                 versions of one application program from a domain not
                 represented in these benchmarks. Adding characteristics
                 of the behavior of data structures enabled easier
                 diagnoses of bottlenecks and more accurate selection of
                 appropriate optimizations than with only code centric
                 behavior measurements. The performance gains ranged
                 from a few percent to 38 percent.",
  acknowledgement = ack-nhfb,
  articleno =    "3",
  fjournal =     "ACM Transactions on Parallel Computing",
  journal-URL =  "http://dl.acm.org/citation.cfm?id=2632163",
}

%%% TOPC v1n1, article 4: adaptive hardware data prefetching on the IBM
%%% POWER7, evaluated for both performance and power.
@Article{Jimenez:2014:APP,
  author =       "V{\'\i}ctor Jim{\'e}nez and Francisco J. Cazorla and
                 Roberto Gioiosa and Alper Buyuktosunoglu and Pradip
                 Bose and Francis P. O'Connell and Bruce G. Mealey",
  title =        "Adaptive Prefetching on {POWER7}: Improving
                 Performance and Power Consumption",
  journal =      j-TOPC,
  volume =       "1",
  number =       "1",
  pages =        "4:1--4:??",
  month =        sep,
  year =         "2014",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2588889",
  ISSN =         "2329-4949 (print), 2329-4957 (electronic)",
  ISSN-L =       "2329-4949",
  bibdate =      "Fri Oct 17 12:28:03 MDT 2014",
  bibsource =    "http://topc.acm.org/;
                 http://www.math.utah.edu/pub/tex/bib/topc.bib",
  abstract =     "Hardware data prefetch engines are integral parts of
                 many general purpose server-class microprocessors in
                 the field today. Some prefetch engines allow users to
                 change some of their parameters. But, the prefetcher is
                 usually enabled in a default configuration during
                 system bring-up, and dynamic reconfiguration of the
                 prefetch engine is not an autonomic feature of current
                 machines. Conceptually, however, it is easy to infer
                 that commonly used prefetch algorithms---when applied
                 in a fixed mode---will not help performance in many
                 cases. In fact, they may actually degrade performance
                 due to useless bus bandwidth consumption and cache
                 pollution, which in turn, will also waste power. We
                 present an adaptive prefetch scheme that dynamically
                 modifies the prefetch settings in order to adapt to
                 workloads' requirements. We use a commercial processor,
                 namely the IBM POWER7 as a vehicle for our study. First
                 we characterize---in terms of performance and power
                 consumption---the prefetcher in that processor using
                 microbenchmarks and SPEC CPU2006. We then present our
                 adaptive prefetch mechanism showing performance
                 improvements with respect to the default prefetch
                 setting up to 2.7X and 1.3X for single-threaded and
                 multiprogrammed workloads, respectively. Adaptive
                 prefetching is also able to reduce power consumption in
                 some cases. Finally, we also evaluate our mechanism
                 with SPECjbb2005, improving both performance and power
                 consumption.",
  acknowledgement = ack-nhfb,
  articleno =    "4",
  fjournal =     "ACM Transactions on Parallel Computing",
  journal-URL =  "http://dl.acm.org/citation.cfm?id=2632163",
}

%%% TOPC v1n1, article 5: architecture/performance of the on-chip
%%% hardware accelerators in IBM's PowerEN processor.
@Article{Heil:2014:APH,
  author =       "Timothy Heil and Anil Krishna and Nicholas Lindberg
                 and Farnaz Toussi and Steven Vanderwiel",
  title =        "Architecture and Performance of the Hardware
                 Accelerators in {IBM}'s {PowerEN} Processor",
  journal =      j-TOPC,
  volume =       "1",
  number =       "1",
  pages =        "5:1--5:??",
  month =        sep,
  year =         "2014",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2588888",
  ISSN =         "2329-4949 (print), 2329-4957 (electronic)",
  ISSN-L =       "2329-4949",
  bibdate =      "Fri Oct 17 12:28:03 MDT 2014",
  bibsource =    "http://topc.acm.org/;
                 http://www.math.utah.edu/pub/tex/bib/topc.bib",
  abstract =     "Computation at the edge of a datacenter has unique
                 characteristics. It deals with streaming data from
                 multiple sources, going to multiple destinations, often
                 requiring repeated application of one or more of
                 several standard algorithmic kernels. These kernels,
                 related to encryption, compression, XML Parsing and
                 regular expression searching on the data, demand a high
                 data processing rate and power efficiency. This
                 suggests the use of hardware acceleration for key
                 functions. However, robust general purpose processing
                 support is necessary to orchestrate the flow of data
                 between accelerators, as well as perform tasks that are
                 not suited to acceleration. Further, these accelerators
                 must be tightly integrated with the general purpose
                 computation in order to keep invocation overhead and
                 latency low. The accelerators must be easy for software
                 to use, and the system must be flexible enough to
                 support evolving networking standards. In this article,
                 we describe and evaluate the architecture of IBM's
                 PowerEN processor, with a focus on PowerEN's
                 architectural enhancements and its on-chip hardware
                 accelerators. PowerEN unites the throughput of
                 application-specific accelerators with the
                 programmability of general purpose cores on a single
                 coherent memory architecture. Hardware acceleration
                 improves throughput by orders of magnitude in some
                 cases compared to equivalent computation on the general
                 purpose cores. By offloading work to the accelerators,
                 general purpose cores are freed to simultaneously work
                 on computation less suited to acceleration.",
  acknowledgement = ack-nhfb,
  articleno =    "5",
  fjournal =     "ACM Transactions on Parallel Computing",
  journal-URL =  "http://dl.acm.org/citation.cfm?id=2632163",
}

%%% TOPC v1n1, article 6: automatic generation of communication
%%% benchmarks from MPI applications via ScalaTrace.
@Article{Wu:2014:MAG,
  author =       "Xing Wu and Frank Mueller and Scott Pakin",
  title =        "A methodology for automatic generation of executable
                 communication specifications from parallel {MPI}
                 applications",
  journal =      j-TOPC,
  volume =       "1",
  number =       "1",
  pages =        "6:1--6:??",
  month =        sep,
  year =         "2014",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2660249",
  ISSN =         "2329-4949 (print), 2329-4957 (electronic)",
  ISSN-L =       "2329-4949",
  bibdate =      "Fri Oct 17 12:28:03 MDT 2014",
  bibsource =    "http://topc.acm.org/;
                 http://www.math.utah.edu/pub/tex/bib/topc.bib",
  abstract =     "Portable parallel benchmarks are widely used for
                 performance evaluation of HPC systems. However, because
                 these are manually produced, they generally represent a
                 greatly simplified view of application behavior,
                 missing the subtle but important-to-performance nuances
                 that may exist in a complete application. This work
                 contributes novel methods to automatically generate
                 highly portable and customizable communication
                 benchmarks from HPC applications. We utilize
                 ScalaTrace, a lossless yet scalable
                 parallel-application tracing framework to collect
                 selected aspects of the run-time behavior of HPC
                 applications, including communication operations and
                 computation time, while abstracting away the details of
                 the computation proper. We subsequently generate
                 benchmarks with nearly identical run-time behavior to
                 the original applications. Results demonstrate that the
                 generated benchmarks are in fact able to preserve the
                 run-time behavior (including both the communication
                 pattern and the execution time) of the original
                 applications. Such automated benchmark generation is
                 without precedent and particularly valuable for
                 proprietary, export-controlled, or classified
                 application codes.",
  acknowledgement = ack-nhfb,
  articleno =    "6",
  fjournal =     "ACM Transactions on Parallel Computing",
  journal-URL =  "http://dl.acm.org/citation.cfm?id=2632163",
}

%%% TOPC v1n1, article 7: inspector/executor auto-parallelization of
%%% irregular loops for distributed-memory systems.
@Article{Ravishankar:2014:APC,
  author =       "Mahesh Ravishankar and John Eisenlohr and
                 Louis-No{\"e}l Pouchet and J. Ramanujam and Atanas
                 Rountev and P. Sadayappan",
  title =        "Automatic parallelization of a class of irregular
                 loops for distributed memory systems",
  journal =      j-TOPC,
  volume =       "1",
  number =       "1",
  pages =        "7:1--7:??",
  month =        sep,
  year =         "2014",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2660251",
  ISSN =         "2329-4949 (print), 2329-4957 (electronic)",
  ISSN-L =       "2329-4949",
  bibdate =      "Fri Oct 17 12:28:03 MDT 2014",
  bibsource =    "http://topc.acm.org/;
                 http://www.math.utah.edu/pub/tex/bib/topc.bib",
  abstract =     "Many scientific applications spend significant time
                 within loops that are parallel, except for dependences
                 from associative reduction operations. However these
                 loops often contain data-dependent control-flow and
                 array-access patterns. Traditional optimizations that
                 rely on purely static analysis fail to generate
                 parallel code in such cases. This article proposes an
                 approach for automatic parallelization for distributed
                 memory environments, using both static and runtime
                 analysis. We formalize the computations that are
                 targeted by this approach and develop algorithms to
                 detect such computations. We also describe algorithms
                 to generate a parallel inspector that performs a
                 runtime analysis of control-flow and array-access
                 patterns, and a parallel executor to take advantage of
                 this information. The effectiveness of the approach is
                 demonstrated on several benchmarks that were
                 automatically transformed using a prototype compiler.
                 For these, the inspector overheads and performance of
                 the executor code were measured. The benefit on
                 real-world applications was also demonstrated through
                 similar manual transformations of an atmospheric
                 modeling software.",
  acknowledgement = ack-nhfb,
  articleno =    "7",
  fjournal =     "ACM Transactions on Parallel Computing",
  journal-URL =  "http://dl.acm.org/citation.cfm?id=2632163",
}

%%% NOTE(review): three formulas in the scraped abstract were garbled
%%% ($O(n \epsilon)$, $\Sigma \{1,\ldots,n\}$, $O(\log_2 n)$); restored
%%% to $O(n^\epsilon)$, $\Sigma \subseteq \{1,\ldots,n\}$, and
%%% $O(\log^2 n)$, consistent with the polylogarithmic-time claim in
%%% the same abstract --- verify against the published paper (DOI
%%% 10.1145/2661653).
@Article{Shun:2014:SPC,
  author =       "Julian Shun and Guy E. Blelloch",
  title =        "A simple parallel {Cartesian} tree algorithm and its
                 application to parallel suffix tree construction",
  journal =      j-TOPC,
  volume =       "1",
  number =       "1",
  pages =        "8:1--8:??",
  month =        sep,
  year =         "2014",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2661653",
  ISSN =         "2329-4949 (print), 2329-4957 (electronic)",
  ISSN-L =       "2329-4949",
  bibdate =      "Fri Oct 17 12:28:03 MDT 2014",
  bibsource =    "http://topc.acm.org/;
                 http://www.math.utah.edu/pub/tex/bib/topc.bib",
  abstract =     "We present a simple linear work and space, and
                 polylogarithmic time parallel algorithm for generating
                 multiway Cartesian trees. We show that bottom-up
                 traversals of the multiway Cartesian tree on the
                 interleaved suffix array and longest common prefix
                 array of a string can be used to answer certain string
                 queries. By adding downward pointers in the tree (e.g.
                 using a hash table), we can also generate suffix trees
                 from suffix arrays on arbitrary alphabets in the same
                 bounds. In conjunction with parallel suffix array
                 algorithms, such as the skew algorithm, this gives a
                 rather simple linear work parallel, $O(n^\epsilon)$
                 time $ (0 < \epsilon < 1) $, algorithm for generating
                 suffix trees over an integer alphabet $ \Sigma
                 \subseteq \{ 1, \ldots {}, n \} $, where $n$ is the
                 length of the input string. It also gives a linear work
                 parallel algorithm requiring $O(\log^2 n)$ time with
                 high probability for constant-sized alphabets. More
                 generally, given a sorted sequence of strings and the
                 longest common prefix lengths between adjacent
                 elements, the algorithm will generate a patricia tree
                 (compacted trie) over the strings. Of independent
                 interest, we describe a work-efficient parallel
                 algorithm for solving the all nearest smaller values
                 problem using Cartesian trees, which is much simpler
                 than the work-efficient parallel algorithm described in
                 previous work. We also present experimental results
                 comparing the performance of the algorithm to existing
                 sequential implementations and a second parallel
                 algorithm that we implement. We present comparisons for
                 the Cartesian tree algorithm on its own and for
                 constructing a suffix tree. The results show that on a
                 variety of strings our algorithm is competitive with
                 the sequential version on a single processor and
                 achieves good speedup on multiple processors. We
                 present experiments for three applications that require
                 only the Cartesian tree, and also for searching using
                 the suffix tree.",
  acknowledgement = ack-nhfb,
  articleno =    "8",
  fjournal =     "ACM Transactions on Parallel Computing",
  journal-URL =  "http://dl.acm.org/citation.cfm?id=2632163",
}

%%% TOPC v1n2 (January 2015), article 9: guest editors' introduction to
%%% the PPoPP'12 special issue; no abstract was captured.
@Article{Pingali:2015:ISI,
  author =       "Keshav Pingali and J. Ramanujam and P. Sadayappan",
  title =        "Introduction to the Special Issue on {PPoPP'12}",
  journal =      j-TOPC,
  volume =       "1",
  number =       "2",
  pages =        "9:1--9:??",
  month =        jan,
  year =         "2015",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2716343",
  ISSN =         "2329-4949 (print), 2329-4957 (electronic)",
  ISSN-L =       "2329-4949",
  bibdate =      "Wed Feb 18 16:46:00 MST 2015",
  bibsource =    "http://topc.acm.org/;
                 http://www.math.utah.edu/pub/tex/bib/topc.bib",
  acknowledgement = ack-nhfb,
  articleno =    "9",
  fjournal =     "ACM Transactions on Parallel Computing",
  journal-URL =  "http://dl.acm.org/citation.cfm?id=2632163",
}

%%% TOPC v1n2, article 10: algorithm-based fault tolerance for dense
%%% LU/Cholesky/QR; also indexed in the Dongarra bibliography (see the
%%% extra bibsource line).
@Article{Bouteiller:2015:ABF,
  author =       "Aurelien Bouteiller and Thomas Herault and George
                 Bosilca and Peng Du and Jack Dongarra",
  title =        "Algorithm-Based Fault Tolerance for Dense Matrix
                 Factorizations, Multiple Failures and Accuracy",
  journal =      j-TOPC,
  volume =       "1",
  number =       "2",
  pages =        "10:1--10:??",
  month =        jan,
  year =         "2015",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2686892",
  ISSN =         "2329-4949 (print), 2329-4957 (electronic)",
  ISSN-L =       "2329-4949",
  bibdate =      "Wed Feb 18 16:46:00 MST 2015",
  bibsource =    "http://topc.acm.org/;
                 http://www.math.utah.edu/pub/bibnet/authors/d/dongarra-jack-j.bib;
                 http://www.math.utah.edu/pub/tex/bib/topc.bib",
  abstract =     "Dense matrix factorizations, such as LU, Cholesky and
                 QR, are widely used for scientific applications that
                 require solving systems of linear equations,
                 eigenvalues and linear least squares problems. Such
                 computations are normally carried out on
                 supercomputers, whose ever-growing scale induces a fast
                 decline of the Mean Time To Failure (MTTF). This
                 article proposes a new hybrid approach, based on
                 Algorithm-Based Fault Tolerance (ABFT), to help matrix
                 factorizations algorithms survive fail-stop failures.
                 We consider extreme conditions, such as the absence of
                 any reliable node and the possibility of losing both
                 data and checksum from a single failure. We will
                 present a generic solution for protecting the right
                 factor, where the updates are applied, of all above
                 mentioned factorizations. For the left factor, where
                 the panel has been applied, we propose a scalable
                 checkpointing algorithm. This algorithm features high
                 degree of checkpointing parallelism and cooperatively
                 utilizes the checksum storage leftover from the right
                 factor protection. The fault-tolerant algorithms
                 derived from this hybrid solution is applicable to a
                 wide range of dense matrix factorizations, with minor
                 modifications. Theoretical analysis shows that the
                 fault tolerance overhead decreases inversely to the
                 scaling in the number of computing units and the
                 problem size. Experimental results of LU and QR
                 factorization on the Kraken (Cray XT5) supercomputer
                 validate the theoretical evaluation and confirm
                 negligible overhead, with- and without-errors.
                 Applicability to tolerate multiple failures and
                 accuracy after multiple recovery is also considered.",
  acknowledgement = ack-nhfb,
  articleno =    "10",
  fjournal =     "ACM Transactions on Parallel Computing",
  journal-URL =  "http://dl.acm.org/citation.cfm?id=2632163",
}

%%% TOPC v1n2, article 11: communication-avoiding successive band
%%% reduction for symmetric eigenvalue problems.
@Article{Ballard:2015:ACS,
  author =       "Grey Ballard and James Demmel and Nicholas Knight",
  title =        "Avoiding Communication in Successive Band Reduction",
  journal =      j-TOPC,
  volume =       "1",
  number =       "2",
  pages =        "11:1--11:??",
  month =        jan,
  year =         "2015",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2686877",
  ISSN =         "2329-4949 (print), 2329-4957 (electronic)",
  ISSN-L =       "2329-4949",
  bibdate =      "Wed Feb 18 16:46:00 MST 2015",
  bibsource =    "http://topc.acm.org/;
                 http://www.math.utah.edu/pub/tex/bib/topc.bib",
  abstract =     "The running time of an algorithm depends on both
                 arithmetic and communication (i.e., data movement)
                 costs, and the relative costs of communication are
                 growing over time. In this work, we present sequential
                 and distributed-memory parallel algorithms for
                 tridiagonalizing full symmetric and symmetric band
                 matrices that asymptotically reduce communication
                 compared to previous approaches. The tridiagonalization
                 of a symmetric band matrix is a key kernel in solving
                 the symmetric eigenvalue problem for both full and band
                 matrices. In order to preserve structure,
                 tridiagonalization routines use annihilate-and-chase
                 procedures that previously have suffered from poor data
                 locality and high parallel latency cost. We improve
                 both by reorganizing the computation and obtain
                 asymptotic improvements. We also propose new algorithms
                 for reducing a full symmetric matrix to band form in a
                 communication-efficient manner. In this article, we
                 consider the cases of computing eigenvalues only and of
                 computing eigenvalues and all eigenvectors.",
  acknowledgement = ack-nhfb,
  articleno =    "11",
  fjournal =     "ACM Transactions on Parallel Computing",
  journal-URL =  "http://dl.acm.org/citation.cfm?id=2632163",
}

%%% TOPC v1n2, article 12: multiported collective algorithms (scatter,
%%% gather, all-gather, reduce-scatter) for torus networks.
@Article{Sack:2015:CAM,
  author =       "Paul Sack and William Gropp",
  title =        "Collective Algorithms for Multiported Torus Networks",
  journal =      j-TOPC,
  volume =       "1",
  number =       "2",
  pages =        "12:1--12:??",
  month =        jan,
  year =         "2015",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2686882",
  ISSN =         "2329-4949 (print), 2329-4957 (electronic)",
  ISSN-L =       "2329-4949",
  bibdate =      "Wed Feb 18 16:46:00 MST 2015",
  bibsource =    "http://topc.acm.org/;
                 http://www.math.utah.edu/pub/tex/bib/topc.bib",
  abstract =     "Modern supercomputers with torus networks allow each
                 node to simultaneously pass messages on all of its
                 links. However, most collective algorithms are designed
                 to only use one link at a time. In this work, we
                 present novel multiported algorithms for the scatter,
                 gather, all-gather, and reduce-scatter operations. Our
                 algorithms can be combined to create multiported
                 reduce, all-reduce, and broadcast algorithms. Several
                 of these algorithms involve a new technique where we
                 relax the MPI message-ordering constraints to achieve
                 high performance and restore the correct ordering using
                 an additional stage of redundant communication.
                 According to our models, on an $n$-dimensional torus,
                 our algorithms should allow for nearly a $2 n$-fold
                 improvement in communication performance compared to
                 known, single-ported torus algorithms. In practice, we
                 have achieved nearly $6\times$ better performance on a
                 32k-node 3-dimensional torus.",
  acknowledgement = ack-nhfb,
  articleno =    "12",
  fjournal =     "ACM Transactions on Parallel Computing",
  journal-URL =  "http://dl.acm.org/citation.cfm?id=2632163",
}

@Article{Dice:2015:LCG,
  author =       "David Dice and Virendra J. Marathe and Nir Shavit",
  title =        "Lock Cohorting: a General Technique for Designing
                 {NUMA} Locks",
  journal =      j-TOPC,
  volume =       "1",
  number =       "2",
  pages =        "13:1--13:??",
  month =        jan,
  year =         "2015",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2686884",
  ISSN =         "2329-4949 (print), 2329-4957 (electronic)",
  ISSN-L =       "2329-4949",
  bibdate =      "Wed Feb 18 16:46:00 MST 2015",
  bibsource =    "http://topc.acm.org/;
                 http://www.math.utah.edu/pub/tex/bib/topc.bib",
  abstract =     "Multicore machines are quickly shifting to NUMA and
                 CC-NUMA architectures, making scalable NUMA-aware
                 locking algorithms, ones that take into account the
                 machine's nonuniform memory and caching hierarchy, ever
                 more important. This article presents lock cohorting, a
                 general new technique for designing NUMA-aware locks
                 that is as simple as it is powerful. Lock cohorting
                 allows one to transform any spin-lock algorithm, with
                  minimal nonintrusive changes, into a scalable NUMA-aware
                 spin-lock. Our new cohorting technique allows us to
                 easily create NUMA-aware versions of the TATAS-Backoff,
                 CLH, MCS, and ticket locks, to name a few. Moreover, it
                 allows us to derive a CLH-based cohort abortable lock,
                 the first NUMA-aware queue lock to support
                 abortability. We empirically compared the performance
                 of cohort locks with prior NUMA-aware and classic
                 NUMA-oblivious locks on a synthetic micro-benchmark, a
                 real world key-value store application memcached, as
                 well as the libc memory allocator. Our results
                 demonstrate that cohort locks perform as well or better
                 than known locks when the load is low and significantly
                 out-perform them as the load increases.",
  acknowledgement = ack-nhfb,
  articleno =    "13",
  fjournal =     "ACM Transactions on Parallel Computing",
  journal-URL =  "http://dl.acm.org/citation.cfm?id=2632163",
}

@Article{Merrill:2015:HPS,
  author =       "Duane Merrill and Michael Garland and Andrew
                 Grimshaw",
  title =        "High-Performance and Scalable {GPU} Graph Traversal",
  journal =      j-TOPC,
  volume =       "1",
  number =       "2",
  pages =        "14:1--14:??",
  month =        jan,
  year =         "2015",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2717511",
  ISSN =         "2329-4949 (print), 2329-4957 (electronic)",
  ISSN-L =       "2329-4949",
  bibdate =      "Wed Feb 18 16:46:00 MST 2015",
  bibsource =    "http://topc.acm.org/;
                 http://www.math.utah.edu/pub/tex/bib/topc.bib",
  abstract =     "Breadth-First Search (BFS) is a core primitive for
                 graph traversal and a basis for many higher-level graph
                 analysis algorithms. It is also representative of a
                 class of parallel computations whose memory accesses
                 and work distribution are both irregular and data
                 dependent. Recent work has demonstrated the
                 plausibility of GPU sparse graph traversal, but has
                 tended to focus on asymptotically inefficient
                 algorithms that perform poorly on graphs with
                 nontrivial diameter. We present a BFS parallelization
                 focused on fine-grained task management constructed
                 from efficient prefix sum computations that achieves an
                  asymptotically optimal $ O(|V| + |E|) $ work complexity.
                 Our implementation delivers excellent performance on
                 diverse graphs, achieving traversal rates in excess of
                 3.3 billion and 8.3 billion traversed edges per second
                 using single- and quad-GPU configurations,
                 respectively. This level of performance is several
                 times faster than state-of-the-art implementations on
                 both CPU and GPU platforms.",
  acknowledgement = ack-nhfb,
  articleno =    "14",
  fjournal =     "ACM Transactions on Parallel Computing",
  journal-URL =  "http://dl.acm.org/citation.cfm?id=2632163",
}

@Article{Kramer:2015:SET,
  author =       "Stephan C. Kramer and Johannes Hagemann",
  title =        "{SciPAL}: Expression Templates and Composition Closure
                 Objects for High Performance Computational Physics with
                 {CUDA} and {OpenMP}",
  journal =      j-TOPC,
  volume =       "1",
  number =       "2",
  pages =        "15:1--15:??",
  month =        jan,
  year =         "2015",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2686886",
  ISSN =         "2329-4949 (print), 2329-4957 (electronic)",
  ISSN-L =       "2329-4949",
  bibdate =      "Wed Feb 18 16:46:00 MST 2015",
  bibsource =    "http://topc.acm.org/;
                 http://www.math.utah.edu/pub/tex/bib/topc.bib",
  abstract =     "We present SciPAL (scientific parallel algorithms
                  library), a C++-based, hardware-independent
                 open-source library. Its core is a domain-specific
                 embedded language for numerical linear algebra. The
                 main fields of application are finite element
                 simulations, coherent optics and the solution of
                 inverse problems. Using SciPAL algorithms can be stated
                 in a mathematically intuitive way in terms of matrix
                 and vector operations. Existing algorithms can easily
                 be adapted to GPU-based computing by proper template
                 specialization. Our library is compatible with the
                  finite element library deal.II and provides a port of
                 deal.II's most frequently used linear algebra classes
                 to CUDA (NVidia's extension of the programming
                  languages C and C++ for programming their GPUs).
                  SciPAL's operator-based API for BLAS operations
                 particularly aims at simplifying the usage of NVidia's
                  CUBLAS. For non-BLAS array arithmetic SciPAL's
                 expression templates are able to generate CUDA kernels
                 at compile time. We demonstrate the benefits of SciPAL
                 using the iterative principal component analysis as
                 example which is the core algorithm for the
                 spike-sorting problem in neuroscience.",
  acknowledgement = ack-nhfb,
  articleno =    "15",
  fjournal =     "ACM Transactions on Parallel Computing",
  journal-URL =  "http://dl.acm.org/citation.cfm?id=2632163",
}

@Article{Totoni:2015:PME,
  author =       "Ehsan Totoni and Nikhil Jain and Laxmikant V. Kale",
  title =        "Power Management of Extreme-Scale Networks with
                 On\slash Off Links in Runtime Systems",
  journal =      j-TOPC,
  volume =       "1",
  number =       "2",
  pages =        "16:1--16:??",
  month =        jan,
  year =         "2015",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2687001",
  ISSN =         "2329-4949 (print), 2329-4957 (electronic)",
  ISSN-L =       "2329-4949",
  bibdate =      "Wed Feb 18 16:46:00 MST 2015",
  bibsource =    "http://topc.acm.org/;
                 http://www.math.utah.edu/pub/tex/bib/topc.bib",
  abstract =     "Networks are among major power consumers in
                 large-scale parallel systems. During execution of
                 common parallel applications, a sizeable fraction of
                 the links in the high-radix interconnects are either
                 never used or are underutilized. We propose a runtime
                 system based adaptive approach to turn off unused
                 links, which has various advantages over the previously
                 proposed hardware and compiler based approaches. We
                 discuss why the runtime system is the best system
                 component to accomplish this task, and test the
                 effectiveness of our approach using real applications
                 (including NAMD, MILC), and application benchmarks
                 (including NAS Parallel Benchmarks, Stencil). These
                 codes are simulated on representative topologies such
                 as 6-D Torus and multilevel directly connected network
                 (similar to IBM PERCS in Power 775 and Dragonfly in
                 Cray Aries). For common applications with near-neighbor
                 communication pattern, our approach can save up to 20\%
                 of total machine's power and energy, without any
                 performance penalty.",
  acknowledgement = ack-nhfb,
  articleno =    "16",
  fjournal =     "ACM Transactions on Parallel Computing",
  journal-URL =  "http://dl.acm.org/citation.cfm?id=2632163",
}

@Article{Herlihy:2015:GEI,
  author =       "Maurice Herlihy",
  title =        "{Guest Editor} Introduction",
  journal =      j-TOPC,
  volume =       "2",
  number =       "1",
  pages =        "1:1--1:??",
  month =        may,
  year =         "2015",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2716306",
  ISSN =         "2329-4949 (print), 2329-4957 (electronic)",
  ISSN-L =       "2329-4949",
  bibdate =      "Thu May 21 16:27:00 MDT 2015",
  bibsource =    "http://topc.acm.org/;
                 http://www.math.utah.edu/pub/tex/bib/topc.bib",
  acknowledgement = ack-nhfb,
  articleno =    "1",
  fjournal =     "ACM Transactions on Parallel Computing",
  journal-URL =  "http://dl.acm.org/citation.cfm?id=2632163",
}

@Article{Degener:2015:LCS,
  author =       "Bastian Degener and Barbara Kempkes and Peter Kling
                  and Friedhelm {Meyer auf der Heide}",
  title =        "Linear and Competitive Strategies for Continuous Robot
                 Formation Problems",
  journal =      j-TOPC,
  volume =       "2",
  number =       "1",
  pages =        "2:1--2:??",
  month =        may,
  year =         "2015",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2742341",
  ISSN =         "2329-4949 (print), 2329-4957 (electronic)",
  ISSN-L =       "2329-4949",
  bibdate =      "Thu May 21 16:27:00 MDT 2015",
  bibsource =    "http://topc.acm.org/;
                 http://www.math.utah.edu/pub/tex/bib/topc.bib",
  abstract =     "We study a scenario in which $n$ mobile robots with a
                 limited viewing range are distributed in the Euclidean
                 plane and have to solve a formation problem. The
                 formation problems we consider are the Gathering
                 problem and the Chain-Formation problem. In the
                 Gathering problem, the robots have to gather in one
                 (not predefined) point, while in the Chain-Formation
                 problem they have to form a connected communication
                 chain of minimal length between two stationary base
                 stations. Each robot may base its decisions where to
                 move only on the current relative positions of
                 neighboring robots (that are within its viewing range);
                 that is, besides having a limited viewing range, the
                 robots are oblivious (they do not use information from
                 the past), have none or only very limited identities,
                 and they do not have a common sense of direction.
                 Variants of these problems (especially for the
                 Gathering problem) have been studied extensively in
                 different discrete time models. In contrast, our work
                 focuses on a continuous time model; that is, the robots
                 continuously sense the positions of other robots within
                 their viewing range and continuously adapt their speed
                 and direction according to some simple, local rules.
                 Hereby, we assume that the robots have a maximum
                 movement speed of one. We show that this idealized idea
                 of continuous sensing allows us to solve the mentioned
                 formation problems in linear time $ O(n) $ (which,
                 given the maximum speed of one, immediately yields a
                 maximum traveled distance of $ O(n)$). Note that in the
                 more classical discrete time models, the best known
                 strategies need at least $ \Theta (n^2)$ or even $
                 \Theta (n^2 \log n)$ timesteps to solve these problems.
                 For the Gathering problem, our analysis solves a
                 problem left open by Gordon et al. [2004], where the
                 authors could prove that gathering in a continuous
                 model is possible in finite time, but were not able to
                 give runtime bounds. Apart from these linear bounds, we
                 also provide runtime bounds for both formation problems
                 that relate the runtime of our strategies to the
                 runtime of an optimal, global algorithm. Specifically,
                 we show that our strategy for the Gathering problem is
                 log OPT-competitive and the strategy for the
                 Chain-Formation problem is $ \log n$ competitive. Here,
                 by $c$-competitive, we mean that our (local) strategy
                 is asymptotically by at most a factor of $c$ slower
                 than an optimal, global strategy.",
  acknowledgement = ack-nhfb,
  articleno =    "2",
  fjournal =     "ACM Transactions on Parallel Computing",
  journal-URL =  "http://dl.acm.org/citation.cfm?id=2632163",
}

@Article{Jain:2015:NOS,
  author =       "Navendu Jain and Ishai Menache and Joseph (Seffi) Naor
                 and Jonathan Yaniv",
  title =        "Near-Optimal Scheduling Mechanisms for
                 Deadline-Sensitive Jobs in Large Computing Clusters",
  journal =      j-TOPC,
  volume =       "2",
  number =       "1",
  pages =        "3:1--3:??",
  month =        may,
  year =         "2015",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2742343",
  ISSN =         "2329-4949 (print), 2329-4957 (electronic)",
  ISSN-L =       "2329-4949",
  bibdate =      "Thu May 21 16:27:00 MDT 2015",
  bibsource =    "http://topc.acm.org/;
                 http://www.math.utah.edu/pub/tex/bib/topc.bib",
  abstract =     "We consider a market-based resource allocation model
                 for batch jobs in cloud computing clusters. In our
                 model, we incorporate the importance of the due date of
                 a job rather than the number of servers allocated to it
                 at any given time. Each batch job is characterized by
                 the work volume of total computing units (e.g., CPU
                 hours) along with a bound on maximum degree of
                 parallelism. Users specify, along with these job
                 characteristics, their desired due date and a value for
                 finishing the job by its deadline. Given this
                 specification, the primary goal is to determine the
                 scheduling of cloud computing instances under capacity
                 constraints in order to maximize the social welfare
                 (i.e., sum of values gained by allocated users). Our
                  main result is a new $ \frac{C}{C - k} \cdot \frac{s}{s -
                  1} $-approximation algorithm for this objective, where
                  $C$ denotes cloud
                 capacity, $k$ is the maximal bound on parallelized
                 execution (in practical settings, $ k < C$) and $s$ is
                 the slackness on the job completion time, that is, the
                 minimal ratio between a specified deadline and the
                 earliest finish time of a job. Our algorithm is based
                 on utilizing dual fitting arguments over a strengthened
                 linear program to the problem. Based on the new
                 approximation algorithm, we construct truthful
                 allocation and pricing mechanisms, in which reporting
                 the true value and other properties of the job
                 (deadline, work volume, and the parallelism bound) is a
                 dominant strategy for all users. To that end, we extend
                 known results for single-value settings to provide a
                 general framework for transforming allocation
                 algorithms into truthful mechanisms in domains of
                 single-value and multi-properties. We then show that
                 the basic mechanism can be extended under proper
                 Bayesian assumptions to the objective of maximizing
                 revenues, which is important for public clouds. We
                 empirically evaluate the benefits of our approach
                 through simulations on data-center job traces, and show
                 that the revenues obtained under our mechanism are
                 comparable with an ideal fixed-price mechanism, which
                 sets an on-demand price using oracle knowledge of
                 users' valuations. Finally, we discuss how our model
                 can be extended to accommodate uncertainties in job
                 work volumes, which is a practical challenge in cloud
                 settings.",
  acknowledgement = ack-nhfb,
  articleno =    "3",
  fjournal =     "ACM Transactions on Parallel Computing",
  journal-URL =  "http://dl.acm.org/citation.cfm?id=2632163",
}

@Article{Feldman:2015:HCG,
  author =       "Moran Feldman and Liane Lewin-Eytan and Joseph (Seffi)
                 Naor",
  title =        "Hedonic Clustering Games",
  journal =      j-TOPC,
  volume =       "2",
  number =       "1",
  pages =        "4:1--4:??",
  month =        may,
  year =         "2015",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2742345",
  ISSN =         "2329-4949 (print), 2329-4957 (electronic)",
  ISSN-L =       "2329-4949",
  bibdate =      "Thu May 21 16:27:00 MDT 2015",
  bibsource =    "http://topc.acm.org/;
                 http://www.math.utah.edu/pub/tex/bib/topc.bib",
  abstract =     "Clustering, the partitioning of objects with respect
                 to a similarity measure, has been extensively studied
                 as a global optimization problem. We investigate
                 clustering from a game-theoretic approach, and consider
                 the class of hedonic clustering games. Here, a
                 self-organized clustering is obtained via decisions
                 made by independent players, corresponding to the
                 elements clustered. Being a hedonic setting, the
                 utility of each player is determined by the identity of
                 the other members of her cluster. This class of games
                 seems to be quite robust, as it fits with rather
                 different, yet commonly used, clustering criteria.
                 Specifically, we investigate hedonic clustering games
                 in two different models: fixed clustering, which
                 subdivides into $k$-median and $k$-center, and
                 correlation clustering. We provide a thorough analysis
                 of these games, characterizing Nash equilibria, and
                 proving upper and lower bounds on the price of anarchy
                 and price of stability. For fixed clustering we focus
                 on the existence of a Nash equilibrium, as it is a
                 rather nontrivial issue in this setting. We study it
                 both for general metrics and special cases, such as
                 line and tree metrics. In the correlation clustering
                 model, we study both minimization and maximization
                 variants, and provide almost tight bounds on both the
                 price of anarchy and price of stability.",
  acknowledgement = ack-nhfb,
  articleno =    "4",
  fjournal =     "ACM Transactions on Parallel Computing",
  journal-URL =  "http://dl.acm.org/citation.cfm?id=2632163",
}

@Article{Jahn:2015:RRA,
  author =       "Janmartin Jahn and Santiago Pagani and Sebastian Kobbe
                 and Jian-Jia Chen and J{\"o}rg Henkel",
  title =        "Runtime Resource Allocation for Software Pipelines",
  journal =      j-TOPC,
  volume =       "2",
  number =       "1",
  pages =        "5:1--5:??",
  month =        may,
  year =         "2015",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2742347",
  ISSN =         "2329-4949 (print), 2329-4957 (electronic)",
  ISSN-L =       "2329-4949",
  bibdate =      "Thu May 21 16:27:00 MDT 2015",
  bibsource =    "http://topc.acm.org/;
                 http://www.math.utah.edu/pub/tex/bib/topc.bib",
  abstract =     "Efficiently allocating the computational resources of
                 many-core systems is one of the most prominent
                 challenges, especially when resource requirements may
                 vary unpredictably at runtime. This is even more
                 challenging when facing unreliable cores --- a scenario
                 that becomes common as the number of cores increases
                 and integration sizes shrink. To address this
                 challenge, this article presents an optimal method for
                 the allocation of the resources to software-pipelined
                 applications. Here we show how runtime observations of
                 the resource requirements of tasks can be used to adapt
                 resource allocations. Furthermore, we show how the
                 optimum can be traded for a high degree of scalability
                 by clustering applications in a distributed,
                 hierarchical manner. To diminish the negative effects
                 of unreliable cores, this article shows how
                 self-organization can effectively restore the integrity
                 of such a hierarchy when it is corrupted by a failing
                 core. Experiments on Intel's 48-core Single-Chip Cloud
                 Computer and in a many-core simulator show that a
                 significant improvement in system throughput can be
                 achieved over the current state of the art.",
  acknowledgement = ack-nhfb,
  articleno =    "5",
  fjournal =     "ACM Transactions on Parallel Computing",
  journal-URL =  "http://dl.acm.org/citation.cfm?id=2632163",
}

@Article{Xu:2015:SVC,
  author =       "Yi Xu and Bo Zhao and Youtao Zhang and Jun Yang",
  title =        "Simple Virtual Channel Allocation for High-Throughput
                 and High-Frequency On-Chip Routers",
  journal =      j-TOPC,
  volume =       "2",
  number =       "1",
  pages =        "6:1--6:??",
  month =        may,
  year =         "2015",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2742349",
  ISSN =         "2329-4949 (print), 2329-4957 (electronic)",
  ISSN-L =       "2329-4949",
  bibdate =      "Thu May 21 16:27:00 MDT 2015",
  bibsource =    "http://topc.acm.org/;
                 http://www.math.utah.edu/pub/tex/bib/topc.bib",
  abstract =     "Packet-switched network-on-chip (NoC) has provided a
                 scalable solution to the communications for tiled
                 multicore processors. However, the virtual channel (VC)
                 buffers in the NoC consume significant dynamic and
                 leakage power. To improve the energy efficiency of the
                 router design, it is advantageous to use small buffer
                 sizes while still maintaining throughput of the
                 network. This article proposes two new virtual channel
                 allocation (VA) mechanisms, termed fixed VC assignment
                 with dynamic VC allocation (FVADA) and adjustable VC
                 assignment with dynamic VC allocation (AVADA). VCs are
                 designated to output ports and allocated to packets
                 according to such assignment. This can help to reduce
                 the head-of-line blocking. Such VC-output port
                 assignment can also be adjusted dynamically to
                 accommodate traffic changes. Simulation results show
                 that both mechanisms can improve network throughput by
                 41\% on average. Real traffic evaluation shows a
                 network latency reduction of up to 66\%. In addition,
                 AVADA can outperform the baseline in throughput with
                 only half of the buffer size. Finally, we are able to
                 achieve comparable or better throughput than a previous
                 dynamic VC allocator while reducing its critical path
                 delay by 57\%. Hence, the proposed VA mechanisms are
                 suitable for low-power, high-throughput, and
                 high-frequency NoC designs.",
  acknowledgement = ack-nhfb,
  articleno =    "6",
  fjournal =     "ACM Transactions on Parallel Computing",
  journal-URL =  "http://dl.acm.org/citation.cfm?id=2632163",
}

@Article{Hammouda:2015:NTE,
  author =       "Adam Hammouda and Andrew R. Siegel and Stephen F.
                 Siegel",
  title =        "Noise-Tolerant Explicit Stencil Computations for
                 Nonuniform Process Execution Rates",
  journal =      j-TOPC,
  volume =       "2",
  number =       "1",
  pages =        "7:1--7:??",
  month =        may,
  year =         "2015",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2742351",
  ISSN =         "2329-4949 (print), 2329-4957 (electronic)",
  ISSN-L =       "2329-4949",
  bibdate =      "Thu May 21 16:27:00 MDT 2015",
  bibsource =    "http://topc.acm.org/;
                 http://www.math.utah.edu/pub/tex/bib/topc.bib",
  abstract =     "Next-generation HPC computing platforms are likely to
                 be characterized by significant, unpredictable
                 nonuniformities in execution time among compute nodes
                 and cores. The resulting load imbalances from this
                 nonuniformity are expected to arise from a variety of
                  sources --- manufacturing discrepancies, dynamic power
                 management, runtime component failure, OS jitter,
                  software-mediated resiliency, and TLB\slash cache
                 performance variations, for example. It is well
                 understood that existing algorithms with frequent
                 points of bulk synchronization will perform relatively
                 poorly in the presence of these sources of process
                 nonuniformity. Thus, recasting classic bulk synchronous
                 algorithms into more asynchronous, coarse-grained
                 parallelism is a critical area of research for
                 next-generation computing. We propose a class of
                 parallel algorithms for explicit stencil computations
                 that can tolerate these nonuniformities by decoupling
                 per process communication and computation in order for
                 each process to progress asynchronously while
                 maintaining solution correctness. These algorithms are
                 benchmarked with a $1$D domain decomposed (``slabbed'')
                 implementation of the $2$D heat equation as a model
                 problem, and are tested in the presence of simulated
                 nonuniform process execution rates. The resulting
                 performance is compared to a classic bulk synchronous
                 implementation of the model problem. Results show that
                 the runtime of this article's algorithm on a machine
                 with simulated process nonuniformities is 5--99\%
                 slower than the runtime of its classic counterpart on a
                 machine free of nonuniformities. However, when both
                 algorithms are run on a machine with comparable
                 synthetic process nonuniformities, this article's
                 algorithm is 1--37 times faster than its classic
                 counterpart.",
  acknowledgement = ack-nhfb,
  articleno =    "7",
  fjournal =     "ACM Transactions on Parallel Computing",
  journal-URL =  "http://dl.acm.org/citation.cfm?id=2632163",
}

@Article{McCreesh:2015:SST,
  author =       "Ciaran McCreesh and Patrick Prosser",
  title =        "The Shape of the Search Tree for the Maximum Clique
                 Problem and the Implications for Parallel Branch and
                 Bound",
  journal =      j-TOPC,
  volume =       "2",
  number =       "1",
  pages =        "8:1--8:??",
  month =        may,
  year =         "2015",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2742359",
  ISSN =         "2329-4949 (print), 2329-4957 (electronic)",
  ISSN-L =       "2329-4949",
  bibdate =      "Thu May 21 16:27:00 MDT 2015",
  bibsource =    "http://topc.acm.org/;
                 http://www.math.utah.edu/pub/tex/bib/topc.bib",
  abstract =     "Finding a maximum clique in a given graph is one of
                 the fundamental NP-hard problems. We compare two
                 multicore thread-parallel adaptations of a
                 state-of-the-art branch-and-bound algorithm for the
                 maximum clique problem and provide a novel explanation
                 as to why they are successful. We show that load
                 balance is sometimes a problem but that the interaction
                 of parallel search order and the most likely location
                 of solutions within the search space is often the
                 dominating consideration. We use this explanation to
                 propose a new low-overhead, scalable work-splitting
                 mechanism. Our approach uses explicit early diversity
                 to avoid strong commitment to the weakest heuristic
                 advice and late resplitting for balance. More
                 generally, we argue that, for branch-and-bound,
                 parallel algorithm design should not be performed
                 independently of the underlying sequential algorithm.",
  acknowledgement = ack-nhfb,
  articleno =    "8",
  fjournal =     "ACM Transactions on Parallel Computing",
  journal-URL =  "http://dl.acm.org/citation.cfm?id=2632163",
}

@Article{Hoefler:2015:RMA,
  author =       "Torsten Hoefler and James Dinan and Rajeev Thakur and
                 Brian Barrett and Pavan Balaji and William Gropp and
                 Keith Underwood",
  title =        "Remote Memory Access Programming in {MPI-3}",
  journal =      j-TOPC,
  volume =       "2",
  number =       "2",
  pages =        "9:1--9:??",
  month =        jul,
  year =         "2015",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2780584",
  ISSN =         "2329-4949 (print), 2329-4957 (electronic)",
  ISSN-L =       "2329-4949",
  bibdate =      "Fri Aug 7 10:22:35 MDT 2015",
  bibsource =    "http://topc.acm.org/;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib;
                 http://www.math.utah.edu/pub/tex/bib/topc.bib",
  abstract =     "The Message Passing Interface (MPI) 3.0 standard,
                 introduced in September 2012, includes a significant
                 update to the one-sided communication interface, also
                 known as remote memory access (RMA). In particular, the
                 interface has been extended to better support popular
                 one-sided and global-address-space parallel programming
                 models to provide better access to hardware performance
                 features and enable new data-access modes. We present
                 the new RMA interface and specify formal axiomatic
                 models for data consistency and access semantics. Such
                 models can help users reason about details of the
                 semantics that are hard to extract from the English
                 prose in the standard. It also fosters the development
                 of tools and compilers, enabling them to automatically
                 analyze, optimize, and debug RMA programs.",
  acknowledgement = ack-nhfb,
  articleno =    "9",
  fjournal =     "ACM Transactions on Parallel Computing",
  journal-URL =  "http://dl.acm.org/citation.cfm?id=2632163",
}

@Article{Maldonado:2015:STB,
  author =       "Walther Maldonado and Patrick Marlier and Pascal
                 Felber and Julia Lawall and Gilles Muller and Etienne
                 Rivi{\`e}re",
  title =        "Supporting Time-Based {QoS} Requirements in Software
                 Transactional Memory",
  journal =      j-TOPC,
  volume =       "2",
  number =       "2",
  pages =        "10:1--10:??",
  month =        jul,
  year =         "2015",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2779621",
  ISSN =         "2329-4949 (print), 2329-4957 (electronic)",
  ISSN-L =       "2329-4949",
  bibdate =      "Fri Aug 7 10:22:35 MDT 2015",
  bibsource =    "http://topc.acm.org/;
                 http://www.math.utah.edu/pub/tex/bib/topc.bib",
  abstract =     "Software transactional memory (STM) is an optimistic
                 concurrency control mechanism that simplifies parallel
                 programming. However, there has been little interest in
                 its applicability to reactive applications in which
                 there is a required response time for certain
                 operations. We propose supporting such applications by
                 allowing programmers to associate time with atomic
                 blocks in the form of deadlines and quality-of-service
                 (QoS) requirements. Based on statistics of past
                 executions, we adjust the execution mode of
                 transactions by decreasing the level of optimism as the
                 deadline approaches. In the presence of concurrent
                 deadlines, we propose different conflict resolution
                 policies. Execution mode switching mechanisms allow the
                 meeting of multiple deadlines in a consistent manner,
                 with potential QoS degradations being split fairly
                 among several threads as contention increases, and
                 avoiding starvation. Our implementation consists of
                 extensions to an STM runtime that allow gathering
                 statistics and switching execution modes. We also
                 propose novel contention managers adapted to
                 transactional workloads subject to deadlines. The
                 experimental evaluation shows that our approaches
                 significantly improve the likelihood of a transaction
                 meeting its deadline and QoS requirement, even in cases
                 where progress is hampered by conflicts and other
                 concurrent transactions with deadlines.",
  acknowledgement = ack-nhfb,
  articleno =    "10",
  fjournal =     "ACM Transactions on Parallel Computing",
  journal-URL =  "http://dl.acm.org/citation.cfm?id=2632163",
}

@Article{Kestor:2015:TPD,
  author =       "Gokcen Kestor and Osman S. Unsal and Adrian Cristal
                 and Serdar Tasiran",
  title =        "{TRADE}: Precise Dynamic Race Detection for Scalable
                 Transactional Memory Systems",
  journal =      j-TOPC,
  volume =       "2",
  number =       "2",
  pages =        "11:1--11:??",
  month =        jul,
  year =         "2015",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2786021",
  ISSN =         "2329-4949 (print), 2329-4957 (electronic)",
  ISSN-L =       "2329-4949",
  bibdate =      "Fri Aug 7 10:22:35 MDT 2015",
  bibsource =    "http://topc.acm.org/;
                 http://www.math.utah.edu/pub/tex/bib/multithreading.bib;
                 http://www.math.utah.edu/pub/tex/bib/topc.bib",
  abstract =     "As other multithreaded programs, transactional memory
                 (TM) programs are prone to race conditions. Previous
                 work focuses on extending existing definitions of data
                 race for lock-based applications to TM applications,
                 which requires all transactions to be totally ordered
                 ``as if'' serialized by a global lock. This approach
                 poses implementation constraints on the STM that
                 severely limits TM applications' performance. This
                 article shows that forcing total ordering among all
                 running transactions, while sufficient, is not
                 necessary. We introduce an alternative data race
                 definition, relaxed transactional data race, that
                 requires ordering of only conflicting transactions. The
                 advantages of our relaxed definition are twofold:
                 First, unlike the previous definition, this definition
                 can be applied to a wide range of TMs, including those
                 that do not enforce transaction total ordering. Second,
                 within a single execution, it exposes a higher number
                 of data races, which considerably reduces debugging
                 time. Based on this definition, we propose a novel and
                 precise race detection tool for C/C++ TM applications
                 (TRADE), which detects data races by tracking
                 happens-before edges among conflicting transactions.
                 Our experiments reveal that TRADE precisely detects
                 data races for STAMP applications running on modern
                 STMs with overhead comparable to state-of-the-art race
                 detectors for lock-based applications. Our experiments
                 also show that in a single run, TRADE identifies
                 several races not discovered by 10 separate runs of a
                 race detection tool based on the previous data race
                 definition.",
  acknowledgement = ack-nhfb,
  articleno =    "11",
  fjournal =     "ACM Transactions on Parallel Computing",
  journal-URL =  "http://dl.acm.org/citation.cfm?id=2632163",
}

@Article{Diegues:2015:TWE,
  author =       "Nuno Diegues and Paolo Romano",
  title =        "{Time-Warp}: Efficient Abort Reduction in
                 Transactional Memory",
  journal =      j-TOPC,
  volume =       "2",
  number =       "2",
  pages =        "12:1--12:??",
  month =        jul,
  year =         "2015",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2775435",
  ISSN =         "2329-4949 (print), 2329-4957 (electronic)",
  ISSN-L =       "2329-4949",
  bibdate =      "Fri Aug 7 10:22:35 MDT 2015",
  bibsource =    "http://topc.acm.org/;
                 http://www.math.utah.edu/pub/tex/bib/topc.bib",
  abstract =     "The multicore revolution that took place one decade
                 ago has turned parallel programming into a major
                 concern for the mainstream software development
                 industry. In this context, Transactional Memory (TM)
                 has emerged as a simpler and attractive alternative to
                 that of lock-based synchronization, whose complexity
                 and error-proneness are widely recognized. The notion
                 of permissiveness in TM translates to only aborting a
                 transaction when it cannot be accepted in any history
                 that guarantees a target correctness criterion. This
                 theoretically powerful property is often neglected by
                 state-of-the-art TMs because it imposes considerable
                 algorithmic costs. Instead, these TMs opt to maximize
                 their implementation's efficiency by aborting
                 transactions under overly conservative conditions. As a
                 result, they risk rejecting a significant number of
                 safe executions. In this article, we seek to identify a
                 sweet spot between permissiveness and efficiency by
                 introducing the Time-Warp Multiversion (TWM) algorithm.
                 TWM is based on the key idea of allowing an update
                 transaction that has performed stale reads (i.e.,
                 missed the writes of concurrently committed
                 transactions) to be serialized by ``committing it in
                 the past,'' which we call a time-warp commit. At its
                 core, TWM uses a novel, lightweight validation
                 mechanism with little computational overhead. TWM also
                 guarantees that read-only transactions can never be
                 aborted. Further, TWM guarantees Virtual World
                 Consistency, a safety property that is deemed as
                 particularly relevant in the context of TM. We
                 demonstrate the practicality of this approach through
                 an extensive experimental study: we compare TWM with
                 five other TMs, representative of typical alternative
                 design choices, and on a wide variety of benchmarks.
                 This study shows an average performance improvement
                 across all considered workloads and TMs of 65\% in high
                 concurrency scenarios, with gains extending up to $9
                 \times $ with the most favorable benchmarks. These
                 results are a consequence of TWM's ability to achieve
                 drastic reduction of aborts in scenarios of nonminimal
                 contention, while introducing little overhead
                 (approximately 10\%) in worst-case, synthetically
                 designed scenarios (i.e., no contention or contention
                 patterns that cannot be optimized using TWM).",
  acknowledgement = ack-nhfb,
  articleno =    "12",
  fjournal =     "ACM Transactions on Parallel Computing",
  journal-URL =  "http://dl.acm.org/citation.cfm?id=2632163",
}

@Article{Eyraud-Dubois:2015:PST,
  author =       "Lionel Eyraud-Dubois and Loris Marchal and Oliver
                 Sinnen and Fr{\'e}d{\'e}ric Vivien",
  title =        "Parallel Scheduling of Task Trees with Limited
                 Memory",
  journal =      j-TOPC,
  volume =       "2",
  number =       "2",
  pages =        "13:1--13:??",
  month =        jul,
  year =         "2015",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2779052",
  ISSN =         "2329-4949 (print), 2329-4957 (electronic)",
  ISSN-L =       "2329-4949",
  bibdate =      "Fri Aug 7 10:22:35 MDT 2015",
  bibsource =    "http://topc.acm.org/;
                 http://www.math.utah.edu/pub/tex/bib/topc.bib",
  abstract =     "This article investigates the execution of tree-shaped
                 task graphs using multiple processors. Each edge of
                 such a tree represents some large data. A task can only
                 be executed if all input and output data fit into
                 memory, and a data can only be removed from memory
                 after the completion of the task that uses it as an
                 input data. Such trees arise in the multifrontal method
                 of sparse matrix factorization. The peak memory needed
                 for the processing of the entire tree depends on the
                 execution order of the tasks. With one processor, the
                 objective of the tree traversal is to minimize the
                 required memory. This problem was well studied, and
                 optimal polynomial algorithms were proposed. Here, we
                 extend the problem by considering multiple processors,
                 which is of obvious interest in the application area of
                 matrix factorization. With multiple processors comes
                 the additional objective to minimize the time needed to
                  traverse the tree---that is, to minimize the makespan.
                 Not surprisingly, this problem proves to be much harder
                 than the sequential one. We study the computational
                 complexity of this problem and provide
                 inapproximability results even for unit weight trees.
                 We design a series of practical heuristics achieving
                 different trade-offs between the minimization of peak
                 memory usage and makespan. Some of these heuristics are
                 able to process a tree while keeping the memory usage
                 under a given memory limit. The different heuristics
                 are evaluated in an extensive experimental evaluation
                 using realistic trees.",
  acknowledgement = ack-nhfb,
  articleno =    "13",
  fjournal =     "ACM Transactions on Parallel Computing",
  journal-URL =  "http://dl.acm.org/citation.cfm?id=2632163",
}

@Article{Dinitz:2015:ISI,
  author =       "Michael Dinitz and Torsten Hoefler",
  title =        "Introduction to the Special Issue on {SPAA 2013}",
  journal =      j-TOPC,
  volume =       "2",
  number =       "3",
  pages =        "14e:1--14e:??",
  month =        oct,
  year =         "2015",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2809923",
  ISSN =         "2329-4949 (print), 2329-4957 (electronic)",
  ISSN-L =       "2329-4949",
  bibdate =      "Tue Nov 3 07:30:42 MST 2015",
  bibsource =    "http://topc.acm.org/;
                 http://www.math.utah.edu/pub/tex/bib/topc.bib",
  acknowledgement = ack-nhfb,
  articleno =    "14e",
  fjournal =     "ACM Transactions on Parallel Computing",
  journal-URL =  "http://dl.acm.org/citation.cfm?id=2632163",
}

@Article{Kumar:2015:FGA,
  author =       "Ravi Kumar and Benjamin Moseley and Sergei
                 Vassilvitskii and Andrea Vattani",
  title =        "Fast Greedy Algorithms in {MapReduce} and Streaming",
  journal =      j-TOPC,
  volume =       "2",
  number =       "3",
  pages =        "14:1--14:??",
  month =        oct,
  year =         "2015",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2809814",
  ISSN =         "2329-4949 (print), 2329-4957 (electronic)",
  ISSN-L =       "2329-4949",
  bibdate =      "Tue Nov 3 07:30:42 MST 2015",
  bibsource =    "http://topc.acm.org/;
                 http://www.math.utah.edu/pub/tex/bib/topc.bib",
  abstract =     "Greedy algorithms are practitioners' best
                 friends---they are intuitive, are simple to implement,
                 and often lead to very good solutions. However,
                 implementing greedy algorithms in a distributed setting
                 is challenging since the greedy choice is inherently
                 sequential, and it is not clear how to take advantage
                 of the extra processing power. Our main result is a
                 powerful sampling technique that aids in
                 parallelization of sequential algorithms. Armed with
                 this primitive, we then adapt a broad class of greedy
                 algorithms to the MapReduce paradigm; this class
                 includes maximum cover and submodular maximization
                  subject to $p$-system constraint problems. Our method
                 yields efficient algorithms that run in a logarithmic
                 number of rounds while obtaining solutions that are
                 arbitrarily close to those produced by the standard
                 sequential greedy algorithm. We begin with algorithms
                 for modular maximization subject to a matroid
                 constraint and then extend this approach to obtain
                 approximation algorithms for submodular maximization
                  subject to knapsack or $p$-system constraints.",
  acknowledgement = ack-nhfb,
  articleno =    "14",
  fjournal =     "ACM Transactions on Parallel Computing",
  journal-URL =  "http://dl.acm.org/citation.cfm?id=2632163",
}

@Article{Sanders:2015:WEM,
  author =       "Peter Sanders and Jochen Speck and Raoul Steffen",
  title =        "Work-Efficient Matrix Inversion in Polylogarithmic
                 Time",
  journal =      j-TOPC,
  volume =       "2",
  number =       "3",
  pages =        "15:1--15:??",
  month =        oct,
  year =         "2015",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2809812",
  ISSN =         "2329-4949 (print), 2329-4957 (electronic)",
  ISSN-L =       "2329-4949",
  bibdate =      "Tue Nov 3 07:30:42 MST 2015",
  bibsource =    "http://topc.acm.org/;
                 http://www.math.utah.edu/pub/tex/bib/topc.bib",
  abstract =     "We present an algorithm for inversion of symmetric
                 positive definite matrices that combines the practical
                 requirement of an optimal number of arithmetic
                 operations and the theoretical goal of a
                 polylogarithmic critical path length. The algorithm
                 reduces inversion to matrix multiplication. It uses
                 Strassen's recursion scheme, but on the critical path
                 it breaks the recursion early, switching to an
                 asymptotically inefficient yet fast use of Newton's
                 method. We also show that the algorithm is numerically
                 stable. Overall, we get a candidate for a massively
                 parallel algorithm that scales to exascale systems even
                 on relatively small inputs.",
  acknowledgement = ack-nhfb,
  articleno =    "15",
  fjournal =     "ACM Transactions on Parallel Computing",
  journal-URL =  "http://dl.acm.org/citation.cfm?id=2632163",
}

@Article{Gilbert:2015:SBO,
  author =       "Seth Gilbert and Chaodong Zheng",
  title =        "{SybilCast}: Broadcast on the Open Airwaves",
  journal =      j-TOPC,
  volume =       "2",
  number =       "3",
  pages =        "16:1--16:??",
  month =        oct,
  year =         "2015",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2809810",
  ISSN =         "2329-4949 (print), 2329-4957 (electronic)",
  ISSN-L =       "2329-4949",
  bibdate =      "Tue Nov 3 07:30:42 MST 2015",
  bibsource =    "http://topc.acm.org/;
                 http://www.math.utah.edu/pub/tex/bib/topc.bib",
  abstract =     "Consider a scenario where many wireless users are
                 attempting to download data from a single base station.
                 While most of the users are honest, some users may be
                 malicious and attempt to obtain more than their fair
                 share of the bandwidth. One possible strategy for
                 attacking the system is to simulate multiple fake
                 identities, each of which is given its own equal share
                 of the bandwidth. Such an attack is often referred to
                 as a sybil attack. To counter such behavior, we propose
                 SybilCast, a protocol for multichannel wireless
                 networks that limits the number of fake identities and,
                 in doing so, ensures that each honest user gets at
                 least a constant fraction of his or her fair share of
                 the bandwidth. As a result, each honest user can
                 complete his or her data download in asymptotically
                 optimal time. A key aspect of this protocol is
                 balancing the rate at which new identities are admitted
                 and the maximum number of fake identities that can
                 coexist while keeping the overhead low. Besides sybil
                 attacks, our protocol can also tolerate spoofing and
                 jamming.",
  acknowledgement = ack-nhfb,
  articleno =    "16",
  fjournal =     "ACM Transactions on Parallel Computing",
  journal-URL =  "http://dl.acm.org/citation.cfm?id=2632163",
}

@Article{Lee:2015:FPP,
  author =       "I-Ting Angelina Lee and Charles E. Leiserson and Tao
                 B. Schardl and Zhunping Zhang and Jim Sukha",
  title =        "On-the-Fly Pipeline Parallelism",
  journal =      j-TOPC,
  volume =       "2",
  number =       "3",
  pages =        "17:1--17:??",
  month =        oct,
  year =         "2015",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2809808",
  ISSN =         "2329-4949 (print), 2329-4957 (electronic)",
  ISSN-L =       "2329-4949",
  bibdate =      "Tue Nov 3 07:30:42 MST 2015",
  bibsource =    "http://topc.acm.org/;
                 http://www.math.utah.edu/pub/tex/bib/topc.bib",
  abstract =     "Pipeline parallelism organizes a parallel program as a
                 linear sequence of stages. Each stage processes
                 elements of a data stream, passing each processed data
                 element to the next stage, and then taking on a new
                 element before the subsequent stages have necessarily
                 completed their processing. Pipeline parallelism is
                 used especially in streaming applications that perform
                 video, audio, and digital signal processing. Three out
                 of 13 benchmarks in PARSEC, a popular software
                 benchmark suite designed for shared-memory
                 multiprocessors, can be expressed as pipeline
                 parallelism. Whereas most concurrency platforms that
                 support pipeline parallelism use a
                 ``construct-and-run'' approach, this article
                 investigates ``on-the-fly'' pipeline parallelism, where
                 the structure of the pipeline emerges as the program
                 executes rather than being specified a priori.
                 On-the-fly pipeline parallelism allows the number of
                 stages to vary from iteration to iteration and
                 dependencies to be data dependent. We propose simple
                 linguistics for specifying on-the-fly pipeline
                 parallelism and describe a provably efficient
                  scheduling algorithm, the Piper algorithm, which
                 integrates pipeline parallelism into a work-stealing
                 scheduler, allowing pipeline and fork-join parallelism
                 to be arbitrarily nested. The Piper algorithm
                 automatically throttles the parallelism, precluding
                 ``runaway'' pipelines. Given a pipeline computation
                 with $ T_1 $ work and $ T_\infty $ span (critical-path
                 length), Piper executes the computation on $P$
                  processors in $ T_P \leq T_1 / P + O(T_\infty + \lg P)$
                 expected time. Piper also limits stack space, ensuring
                 that it does not grow unboundedly with running time. We
                 have incorporated on-the-fly pipeline parallelism into
                 a Cilk-based work-stealing runtime system. Our
                 prototype Cilk-P implementation exploits optimizations
                 such as ``lazy enabling'' and ``dependency folding.''
                 We have ported the three PARSEC benchmarks that exhibit
                 pipeline parallelism to run on Cilk-P. One of these,
                 x264, cannot readily be executed by systems that
                 support only construct-and-run pipeline parallelism.
                 Benchmark results indicate that Cilk-P has low serial
                 overhead and good scalability. On x264, for example,
                 Cilk-P exhibits a speedup of 13.87 over its respective
                 serial counterpart when running on 16 processors.",
  acknowledgement = ack-nhfb,
  articleno =    "17",
  fjournal =     "ACM Transactions on Parallel Computing",
  journal-URL =  "http://dl.acm.org/citation.cfm?id=2632163",
}

@Article{Eikel:2015:IRI,
  author =       "Martina Eikel and Christian Scheideler",
  title =        "{IRIS}: a Robust Information System Against Insider
                 {DoS} Attacks",
  journal =      j-TOPC,
  volume =       "2",
  number =       "3",
  pages =        "18:1--18:??",
  month =        oct,
  year =         "2015",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2809806",
  ISSN =         "2329-4949 (print), 2329-4957 (electronic)",
  ISSN-L =       "2329-4949",
  bibdate =      "Tue Nov 3 07:30:42 MST 2015",
  bibsource =    "http://topc.acm.org/;
                 http://www.math.utah.edu/pub/tex/bib/topc.bib",
  abstract =     "In this work, we present the first scalable
                 distributed information system, that is, a system with
                 low storage overhead, that is provably robust against
                 denial-of-service (DoS) attacks by a current insider.
                 We allow a current insider to have complete knowledge
                 about the information system and to have the power to
                  block any $\varepsilon$-fraction of its servers by a DoS
                  attack, where $\varepsilon$ can be chosen up to a
                 constant. The task of the system is to serve any
                 collection of lookup requests with at most one per
                 nonblocked server in an efficient way despite this
                 attack. Previously, scalable solutions were only known
                 for DoS attacks of past insiders, where a past insider
                 only has complete knowledge about some past time point
                 $ t_0 $ of the information system. Scheideler et al.
                 [Awerbuch and Scheideler 2007; Baumgart et al. 2009]
                 showed that in this case, it is possible to design an
                 information system so that any information that was
                 inserted or last updated after $ t_0 $ is safe against
                 a DoS attack. But their constructions would not work at
                 all for a current insider. The key idea behind our IRIS
                 system is to make extensive use of coding. More
                 precisely, we present two alternative distributed
                 coding strategies with an at most logarithmic storage
                 overhead that can handle up to a constant fraction of
                 blocked servers.",
  acknowledgement = ack-nhfb,
  articleno =    "18",
  fjournal =     "ACM Transactions on Parallel Computing",
  journal-URL =  "http://dl.acm.org/citation.cfm?id=2632163",
}

@Article{Kling:2015:PSM,
  author =       "Peter Kling and Peter Pietrzyk",
  title =        "Profitable Scheduling on Multiple Speed-Scalable
                 Processors",
  journal =      j-TOPC,
  volume =       "2",
  number =       "3",
  pages =        "19:1--19:??",
  month =        oct,
  year =         "2015",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2809872",
  ISSN =         "2329-4949 (print), 2329-4957 (electronic)",
  ISSN-L =       "2329-4949",
  bibdate =      "Tue Nov 3 07:30:42 MST 2015",
  bibsource =    "http://topc.acm.org/;
                 http://www.math.utah.edu/pub/tex/bib/topc.bib",
  abstract =     "We present a new online algorithm for profit-oriented
                 scheduling on multiple speed-scalable processors and
                 provide a tight analysis of the algorithm's
                 competitiveness. Our results generalize and improve
                 upon work by Chan et al. [2010], which considers a
                 single speed-scalable processor. Using significantly
                 different techniques, we can not only extend their
                 model to multiprocessors but also prove an enhanced and
                 tight competitive ratio for our algorithm. In our
                 scheduling problem, jobs arrive over time and are
                 preemptable. They have different workloads, values, and
                 deadlines. The scheduler may decide not to finish a job
                 but instead to suffer a loss equaling the job's value.
                 However, to process a job's workload until its deadline
                 the scheduler must invest a certain amount of energy.
                 The cost of a schedule is the sum of lost values and
                 invested energy. In order to finish a job, the
                 scheduler has to determine which processors to use and
                 set their speeds accordingly. A processor's energy
                 consumption is power $ P_\alpha (s) $ integrated over
                 time, where $ P_\alpha (s) = s^\alpha $ is the power
                 consumption when running at speed $s$. Since we
                 consider the online variant of the problem, the
                 scheduler has no knowledge about future jobs. This
                 problem was introduced by Chan et al. [2010] for the
                 case of a single processor. They presented an online
                 algorithm that is $ \alpha^\alpha + 2 e \alpha
                 $-competitive. We provide an online algorithm for the
                 case of multiple processors with an improved
                 competitive ratio of $ \alpha^\alpha $.",
  acknowledgement = ack-nhfb,
  articleno =    "19",
  fjournal =     "ACM Transactions on Parallel Computing",
  journal-URL =  "http://dl.acm.org/citation.cfm?id=2632163",
}

@Article{Dutta:2015:CBR,
  author =       "Chinmoy Dutta and Gopal Pandurangan and Rajmohan
                 Rajaraman and Scott Roche",
  title =        "Coalescing-Branching Random Walks on Graphs",
  journal =      j-TOPC,
  volume =       "2",
  number =       "3",
  pages =        "20:1--20:??",
  month =        oct,
  year =         "2015",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2817830",
  ISSN =         "2329-4949 (print), 2329-4957 (electronic)",
  ISSN-L =       "2329-4949",
  bibdate =      "Tue Nov 3 07:30:42 MST 2015",
  bibsource =    "http://topc.acm.org/;
                 http://www.math.utah.edu/pub/tex/bib/topc.bib",
  abstract =     "We study a distributed randomized information
                 propagation mechanism in networks we call the
                 coalescing-branching random walk (cobra walk, for
                 short). A cobra walk is a generalization of the
                 well-studied ``standard'' random walk, and is useful in
                  modeling and understanding the
                  Susceptible-Infected-Susceptible (SIS)-type of epidemic
                  processes in
                 networks. It can also be helpful in performing
                 light-weight information dissemination in
                 resource-constrained networks. A cobra walk is
                 parameterized by a branching factor $k$. The process
                 starts from an arbitrary vertex, which is labeled
                 active for step 1. In each step of a cobra walk, each
                 active vertex chooses $k$ random neighbors to become
                 active for the next step (``branching''). A vertex is
                 active for step $ t + 1$ only if it is chosen by an
                 active vertex in step $t$ (``coalescing''). This
                 results in a stochastic process in the underlying
                 network with properties that are quite different from
                 both the standard random walk (which is equivalent to
                 the cobra walk with branching factor 1) as well as
                 other gossip-based rumor spreading mechanisms. We focus
                 on the cover time of the cobra walk, which is the
                 number of steps for the walk to reach all the vertices,
                 and derive almost-tight bounds for various graph
                 classes. We show an $ O(\log^2 n)$ high probability
                 bound for the cover time of cobra walks on expanders,
                 if either the expansion factor or the branching factor
                 is sufficiently large; we also obtain an $ O(\log n)$
                 high probability bound for the partial cover time,
                 which is the number of steps needed for the walk to
                 reach at least a constant fraction of the vertices. We
                 also show that the cover time of the cobra walk is,
                 with high probability, $ O(n \log n)$ on any $n$-vertex
                 tree for $ k \geq 2$, $ {\~ O}(n^{1 / d})$ on a
                 $d$-dimensional grid for $ k \geq 2$, and $ O(\log n)$
                 on the complete graph.",
  acknowledgement = ack-nhfb,
  articleno =    "20",
  fjournal =     "ACM Transactions on Parallel Computing",
  journal-URL =  "http://dl.acm.org/citation.cfm?id=2632163",
}

@Article{Larus:2016:ISI,
  author =       "James Larus and Sandhya Dwarkadas and Jos{\'e} Moreira
                 and Andrew Lumsdaine",
  title =        "Introduction to the Special Issue on {PPoPP'14}",
  journal =      j-TOPC,
  volume =       "2",
  number =       "4",
  pages =        "21:1--21:??",
  month =        mar,
  year =         "2016",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2856513",
  ISSN =         "2329-4949 (print), 2329-4957 (electronic)",
  ISSN-L =       "2329-4949",
  bibdate =      "Sat Mar 19 08:11:13 MDT 2016",
  bibsource =    "http://topc.acm.org/;
                 http://www.math.utah.edu/pub/tex/bib/topc.bib",
  acknowledgement = ack-nhfb,
  articleno =    "21",
  fjournal =     "ACM Transactions on Parallel Computing",
  journal-URL =  "http://dl.acm.org/citation.cfm?id=2632163",
  remark =       "Special Issue on PPoPP'14 conference.",
}

@Article{Herlihy:2016:WSF,
  author =       "Maurice Herlihy and Zhiyu Liu",
  title =        "Well-Structured Futures and Cache Locality",
  journal =      j-TOPC,
  volume =       "2",
  number =       "4",
  pages =        "22:1--22:??",
  month =        mar,
  year =         "2016",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2858650",
  ISSN =         "2329-4949 (print), 2329-4957 (electronic)",
  ISSN-L =       "2329-4949",
  bibdate =      "Sat Mar 19 08:11:13 MDT 2016",
  bibsource =    "http://topc.acm.org/;
                 http://www.math.utah.edu/pub/tex/bib/topc.bib",
  abstract =     "In fork-join parallelism, a sequential program is
                 split into a directed acyclic graph of tasks linked by
                 directed dependency edges, and the tasks are executed,
                 possibly in parallel, in an order consistent with their
                 dependencies. A popular and effective way to extend
                 fork-join parallelism is to allow threads to create
                 futures. A thread creates a future to hold the results
                 of a computation, which may or may not be executed in
                 parallel. That result is returned when some thread
                 touches that future, blocking if necessary until the
                 result is ready. Recent research has shown that
                 although futures can, of course, enhance parallelism in
                 a structured way, they can have a deleterious effect on
                 cache locality. In the worst case, futures can incur $
                 \Omega (P T_\infty + t T_\infty) $ deviations, which
                 implies $ \Omega (C P T_\infty + C t T_\infty) $
                 additional cache misses, where $C$ is the number of
                 cache lines, $P$ is the number of processors, $t$ is
                 the number of touches, and $ T_\infty $ is the
                 computation span. Since cache locality has a large
                 impact on software performance on modern multicores,
                 this result is troubling. In this article, we show that
                 if futures are used in a simple, disciplined way, then
                 the situation is much better: if each future is touched
                 only once, either by the thread that created it or by a
                 later descendant of the thread that created it, then
                 parallel executions with work stealing can incur at
                 most $ O(C P T^2_\infty)$ additional cache misses-a
                 substantial improvement. This structured use of futures
                 is characteristic of many (but not all) parallel
                 applications.",
  acknowledgement = ack-nhfb,
  articleno =    "22",
  fjournal =     "ACM Transactions on Parallel Computing",
  journal-URL =  "http://dl.acm.org/citation.cfm?id=2632163",
  remark =       "Special Issue on PPoPP'14 conference.",
}

@Article{Thomson:2016:CTU,
  author =       "Paul Thomson and Alastair F. Donaldson and Adam
                 Betts",
  title =        "Concurrency Testing Using Controlled Schedulers: an
                 Empirical Study",
  journal =      j-TOPC,
  volume =       "2",
  number =       "4",
  pages =        "23:1--23:??",
  month =        mar,
  year =         "2016",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2858651",
  ISSN =         "2329-4949 (print), 2329-4957 (electronic)",
  ISSN-L =       "2329-4949",
  bibdate =      "Sat Mar 19 08:11:13 MDT 2016",
  bibsource =    "http://topc.acm.org/;
                 http://www.math.utah.edu/pub/tex/bib/topc.bib",
  abstract =     "We present an independent empirical study on
                 concurrency testing using controlled schedulers. We
                 have gathered 49 buggy concurrent software benchmarks,
                 drawn from public code bases, which we call SCTBench.
                 We applied a modified version of an existing
                 concurrency testing tool to SCTBench, testing five
                 controlled scheduling techniques: depth-first search,
                 preemption bounding, delay bounding, a controlled
                 random scheduler, and probabilistic concurrency testing
                 (PCT). We attempt to answer several research questions:
                 Which technique performs the best, in terms of bug
                 finding ability? How effective are the two main
                 schedule bounding techniques-preemption bounding and
                 delay bounding-at finding bugs? What challenges are
                 associated with applying concurrency testing techniques
                 to existing code? Can we classify certain benchmarks as
                 trivial or nontrivial? Overall, we found that PCT (with
                 parameter d = 3) was the most effective technique in
                 terms of bug finding; it found all bugs found by the
                 other techniques, plus an additional three, and it
                 missed only one bug. Surprisingly, we found that the
                 naive controlled random scheduler, which randomly
                 chooses one thread to execute at each scheduling point,
                 performs well, finding more bugs than preemption
                 bounding and just two fewer bugs than delay bounding.
                 Our findings confirm that delay bounding is superior to
                 preemption bounding and that schedule bounding is
                 superior to an unbounded depth-first search. The
                 majority of bugs in SCTBench can be exposed using a
                 small schedule bound (1--2), supporting previous
                 claims, although one benchmark requires five
                 preemptions. We found that the need to remove
                 nondeterminism and control all synchronization (as is
                 required for systematic concurrency testing) can be
                 nontrivial. There were eight distinct programs that
                  could not easily be included in our study, such as
                 those that perform network and interprocess
                 communication. We report various properties about the
                 benchmarks tested, such as the fact that the bugs in 18
                 benchmarks were exposed 50\% of the time when using
                 random scheduling. We note that future work should not
                 use the benchmarks that we classify as trivial when
                 presenting new techniques, other than as a minimum
                 baseline. We have made SCTBench and our tools publicly
                 available for reproducibility and use in future work.",
  acknowledgement = ack-nhfb,
  articleno =    "23",
  fjournal =     "ACM Transactions on Parallel Computing",
  journal-URL =  "http://dl.acm.org/citation.cfm?id=2632163",
  remark =       "Special Issue on PPoPP'14 conference.",
}

@Article{Petrovic:2016:LHM,
  author =       "Darko Petrovi{\'c} and Thomas Ropars and Andr{\'e}
                 Schiper",
  title =        "Leveraging Hardware Message Passing for Efficient
                 Thread Synchronization",
  journal =      j-TOPC,
  volume =       "2",
  number =       "4",
  pages =        "24:1--24:??",
  month =        mar,
  year =         "2016",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2858652",
  ISSN =         "2329-4949 (print), 2329-4957 (electronic)",
  ISSN-L =       "2329-4949",
  bibdate =      "Sat Mar 19 08:11:13 MDT 2016",
  bibsource =    "http://topc.acm.org/;
                 http://www.math.utah.edu/pub/tex/bib/topc.bib",
  abstract =     "As the level of parallelism in manycore processors
                 keeps increasing, providing efficient mechanisms for
                 thread synchronization in concurrent programs is
                 becoming a major concern. On cache-coherent
                 shared-memory processors, synchronization efficiency is
                 ultimately limited by the performance of the underlying
                 cache coherence protocol. This article studies how
                 hardware support for message passing can improve
                 synchronization performance. Considering the ubiquitous
                 problem of mutual exclusion, we devise novel algorithms
                 for (i) classic locking, where application threads
                 obtain exclusive access to a shared resource prior to
                 executing their critical sections (CSes), and (ii)
                 delegation, where CSes are executed by special threads.
                 For classic locking, our HybLock algorithm uses a mix
                 of shared memory and hardware message passing, which
                 introduces the idea of hybrid synchronization
                 algorithms. For delegation, we propose mp-server and
                 HybComb: the former is a straightforward adaptation of
                 the server approach to hardware message passing,
                 whereas the latter is a novel hybrid combining
                 algorithm. Evaluation on Tilera's TILE-Gx processor
                 shows that HybLock outperforms the best known classic
                 locks. Furthermore, mp-server can execute contended
                 CSes with unprecedented throughput, as stalls related
                 to cache coherence are removed from the critical path.
                 HybComb can achieve comparable performance while
                 avoiding the need to dedicate server cores.
                 Consequently, our queue and stack implementations,
                 based on the new synchronization algorithms, largely
                 outperform their most efficient shared-memory-only
                 counterparts.",
  acknowledgement = ack-nhfb,
  articleno =    "24",
  fjournal =     "ACM Transactions on Parallel Computing",
  journal-URL =  "http://dl.acm.org/citation.cfm?id=2632163",
  remark =       "Special Issue on PPoPP'14 conference.",
}

@Article{Tardieu:2016:XAP,
  author =       "Olivier Tardieu and Benjamin Herta and David
                 Cunningham and David Grove and Prabhanjan Kambadur and
                 Vijay Saraswat and Avraham Shinnar and Mikio Takeuchi
                 and Mandana Vaziri and Wei Zhang",
  title =        "{X10} and {APGAS} at Petascale",
  journal =      j-TOPC,
  volume =       "2",
  number =       "4",
  pages =        "25:1--25:??",
  month =        mar,
  year =         "2016",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2894746",
  ISSN =         "2329-4949 (print), 2329-4957 (electronic)",
  ISSN-L =       "2329-4949",
  bibdate =      "Sat Mar 19 08:11:13 MDT 2016",
  bibsource =    "http://topc.acm.org/;
                 http://www.math.utah.edu/pub/tex/bib/topc.bib",
  abstract =     "X10 is a high-performance, high-productivity
                 programming language aimed at large-scale distributed
                 and shared-memory parallel applications. It is based on
                 the Asynchronous Partitioned Global Address Space
                 (APGAS) programming model, supporting the same
                 fine-grained concurrency mechanisms within and across
                 shared-memory nodes. We demonstrate that X10 delivers
                 solid performance at petascale by running (weak
                 scaling) eight application kernels on an IBM Power--775
                 supercomputer utilizing up to 55,680 Power7 cores (for
                 1.7Pflop/s of theoretical peak performance). For the
                 four HPC Class 2 Challenge benchmarks, X10 achieves
                 41\% to 87\% of the system's potential at scale (as
                 measured by IBM's HPCC Class 1 optimized runs). We also
                  implement K-Means, Smith--Waterman, Betweenness
                 Centrality, and Unbalanced Tree Search (UTS) for
                 geometric trees. Our UTS implementation is the first to
                 scale to petaflop systems. We describe the advances in
                 distributed termination detection, distributed load
                 balancing, and use of high-performance interconnects
                 that enable X10 to scale out to tens of thousands of
                 cores. We discuss how this work is driving the
                 evolution of the X10 language, core class libraries,
                 and runtime systems.",
  acknowledgement = ack-nhfb,
  articleno =    "25",
  fjournal =     "ACM Transactions on Parallel Computing",
  journal-URL =  "http://dl.acm.org/citation.cfm?id=2632163",
  remark =       "Special Issue on PPoPP'14 conference.",
}

@Article{Maleki:2016:LRM,
  author =       "Saeed Maleki and Madanlal Musuvathi and Todd
                 Mytkowicz",
  title =        "Low-Rank Methods for Parallelizing Dynamic Programming
                 Algorithms",
  journal =      j-TOPC,
  volume =       "2",
  number =       "4",
  pages =        "26:1--26:??",
  month =        mar,
  year =         "2016",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2884065",
  ISSN =         "2329-4949 (print), 2329-4957 (electronic)",
  ISSN-L =       "2329-4949",
  bibdate =      "Sat Mar 19 08:11:13 MDT 2016",
  bibsource =    "http://topc.acm.org/;
                 http://www.math.utah.edu/pub/tex/bib/topc.bib",
  abstract =     "This article proposes efficient parallel methods for
                 an important class of dynamic programming problems that
                 includes Viterbi, Needleman--Wunsch, Smith--Waterman,
                 and Longest Common Subsequence. In dynamic programming,
                 the subproblems that do not depend on each other, and
                 thus can be computed in parallel, form stages or
                 wavefronts. The methods presented in this article
                 provide additional parallelism allowing multiple stages
                 to be computed in parallel despite dependencies among
                 them. The correctness and the performance of the
                 algorithm relies on rank convergence properties of
                 matrix multiplication in the tropical semiring, formed
                 with plus as the multiplicative operation and max as
                 the additive operation. This article demonstrates the
                 efficiency of the parallel algorithm by showing
                 significant speedups on a variety of important dynamic
                 programming problems. In particular, the parallel
                 Viterbi decoder is up to $ 24 \times $ faster (with 64
                 processors) than a highly optimized commercial
                 baseline.",
  acknowledgement = ack-nhfb,
  articleno =    "26",
  fjournal =     "ACM Transactions on Parallel Computing",
  journal-URL =  "http://dl.acm.org/citation.cfm?id=2632163",
  remark =       "Special Issue on PPoPP'14 conference.",
}

@Article{Yuan:2016:FCN,
  author =       "Xin Yuan and Wickus Nienaber and Santosh Mahapatra",
  title =        "On Folded-{Clos} Networks with Deterministic
                 Single-Path Routing",
  journal =      j-TOPC,
  volume =       "2",
  number =       "4",
  pages =        "27:1--27:??",
  month =        mar,
  year =         "2016",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2858654",
  ISSN =         "2329-4949 (print), 2329-4957 (electronic)",
  ISSN-L =       "2329-4949",
  bibdate =      "Sat Mar 19 08:11:13 MDT 2016",
  bibsource =    "http://topc.acm.org/;
                 http://www.math.utah.edu/pub/tex/bib/topc.bib",
  abstract =     "Folded-Clos networks, also known as fat-trees, have
                 been widely used as interconnects in large-scale
                 high-performance computing clusters. Although users
                 often treat such interconnects as replacements of
                 nonblocking crossbar switches that can carry out any
                 permutation communication without contention, the
                 networking capability of such interconnects without a
                 centralized controller in computer communication
                 environments is not well understood. In this article,
                 we investigate nonblocking two-level folded-Clos
                 networks with deterministic single-path routing, but no
                 centralized controller, and establish the nonblocking
                 condition. The results indicate that nonblocking
                 two-level folded-Clos networks without a centralized
                 controller are much more expensive to construct than
                 the traditional nonblocking networks in the
                 telecommunication environment. Practical two-level
                 folded-Clos based interconnects are blocking. For such
                 interconnects, we establish the lower bound for
                 worst-case contention for permutations with any
                 deterministic single-path routing scheme, show that
                 existing routing schemes perform poorly in terms of
                 worst-case contention for permutations, present a
                 routing scheme that achieves the theoretical optimal,
                 and empirically compare the performance of existing
                 schemes with the optimal routing scheme. The techniques
                 developed for two-level folded-Clos networks are
                 further extended for the general fat-trees of any
                 heights.",
  acknowledgement = ack-nhfb,
  articleno =    "27",
  fjournal =     "ACM Transactions on Parallel Computing",
  journal-URL =  "http://dl.acm.org/citation.cfm?id=2632163",
  remark =       "Special Issue on PPoPP'14 conference.",
}

@Article{Sandes:2016:MMA,
  author =       "Edans F. De O. Sandes and Guillermo Miranda and Xavier
                 Martorell and Eduard Ayguade and George Teodoro and
                 Alba C. M. A. {De Melo}",
  title =        "{MASA}: a Multiplatform Architecture for Sequence
                 Aligners with Block Pruning",
  journal =      j-TOPC,
  volume =       "2",
  number =       "4",
  pages =        "28:1--28:??",
  month =        mar,
  year =         "2016",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2858656",
  ISSN =         "2329-4949 (print), 2329-4957 (electronic)",
  ISSN-L =       "2329-4949",
  bibdate =      "Sat Mar 19 08:11:13 MDT 2016",
  bibsource =    "http://topc.acm.org/;
                 http://www.math.utah.edu/pub/tex/bib/topc.bib",
  abstract =     "Biological sequence alignment is a very popular
                 application in Bioinformatics, used routinely
                 worldwide. Many implementations of biological sequence
                 alignment algorithms have been proposed for multicores,
                 GPUs, FPGAs and CellBEs. These implementations are
                 platform-specific; porting them to other systems
                 requires considerable programming effort. This article
                 proposes and evaluates MASA, a flexible and
                 customizable software architecture that enables the
                 execution of biological sequence alignment applications
                 with three variants (local, global, and semiglobal) in
                 multiple hardware/software platforms with block
                 pruning, which is able to reduce significantly the
                 amount of data processed. To attain our flexibility
                 goals, we also propose a generic version of block
                 pruning and developed multiple parallelization
                 strategies as building blocks, including a new
                 asynchronous dataflow-based parallelization, which may
                 be combined to implement efficient aligners in
                 different platforms. We provide four MASA aligner
                 implementations for multicores (OmpSs and OpenMP), GPU
                 (CUDA), and Intel Phi (OpenMP), showing that MASA is
                 very flexible. The evaluation of our generic block
                 pruning strategy shows that it significantly
                 outperforms the previously proposed block pruning,
                 being able to prune up to 66.5\% of the cells when
                 using the new dataflow-based parallelization
                 strategy.",
  acknowledgement = ack-nhfb,
  articleno =    "28",
  fjournal =     "ACM Transactions on Parallel Computing",
  journal-URL =  "http://dl.acm.org/citation.cfm?id=2632163",
  remark =       "Special Issue on PPoPP'14 conference.",
}

@Article{MeyeraufderHeide:2016:ISI,
  author =       "Friedhelm {Meyer auf der Heide} and Peter Sanders and
                 Nodari Sitchinava",
  title =        "Introduction to the Special Issue on {SPAA 2014}",
  journal =      j-TOPC,
  volume =       "3",
  number =       "1",
  pages =        "1:1--1:??",
  month =        aug,
  year =         "2016",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2936716",
  ISSN =         "2329-4949 (print), 2329-4957 (electronic)",
  ISSN-L =       "2329-4949",
  bibdate =      "Fri Sep 23 15:24:51 MDT 2016",
  bibsource =    "http://topc.acm.org/;
                 http://www.math.utah.edu/pub/tex/bib/topc.bib",
  acknowledgement = ack-nhfb,
  articleno =    "1",
  fjournal =     "ACM Transactions on Parallel Computing",
  journal-URL =  "http://dl.acm.org/citation.cfm?id=2632163",
}

@Article{Kaler:2016:EDD,
  author =       "Tim Kaler and William Hasenplaugh and Tao B. Schardl
                 and Charles E. Leiserson",
  title =        "Executing Dynamic Data-Graph Computations
                 Deterministically Using Chromatic Scheduling",
  journal =      j-TOPC,
  volume =       "3",
  number =       "1",
  pages =        "2:1--2:??",
  month =        aug,
  year =         "2016",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2896850",
  ISSN =         "2329-4949 (print), 2329-4957 (electronic)",
  ISSN-L =       "2329-4949",
  bibdate =      "Fri Sep 23 15:24:51 MDT 2016",
  bibsource =    "http://topc.acm.org/;
                 http://www.math.utah.edu/pub/tex/bib/topc.bib",
  abstract =     "A data-graph computation-popularized by such
                 programming systems as Galois, Pregel, GraphLab,
                 PowerGraph, and GraphChi-is an algorithm that performs
                 local updates on the vertices of a graph. During each
                 round of a data-graph computation, an update function
                 atomically modifies the data associated with a vertex
                 as a function of the vertex's prior data and that of
                 adjacent vertices. A dynamic data-graph computation
                 updates only an active subset of the vertices during a
                 round, and those updates determine the set of active
                 vertices for the next round. This article introduces
                 Prism, a chromatic-scheduling algorithm for executing
                 dynamic data-graph computations. Prism uses a vertex
                 coloring of the graph to coordinate updates performed
                 in a round, precluding the need for mutual-exclusion
                 locks or other nondeterministic data synchronization. A
                 multibag data structure is used by Prism to maintain a
                 dynamic set of active vertices as an unordered set
                 partitioned by color. We analyze Prism using work-span
                 analysis. Let $ G = (V, E) $ be a degree-$ \Delta $
                  graph colored with $ \chi $ colors, and suppose that $ Q
                 \subseteq V $ is the set of active vertices in a round.
                 Define $ {\rm size} (Q) = | Q | + \Sigma_{v \in Q} {\rm
                 deg}(v) $, which is proportional to the space required
                 to store the vertices of $Q$ using a sparse-graph
                 layout. We show that a $P$-processor execution of Prism
                  performs updates in $Q$ using $ O(\chi (\lg (Q / \chi)
                  + \lg \Delta) + \lg P) $ span and $ \Theta ({\rm size} (Q)
                 + P)$ work. These theoretical guarantees are matched by
                 good empirical performance. To isolate the effect of
                 the scheduling algorithm on performance, we modified
                 GraphLab to incorporate Prism and studied seven
                 application benchmarks on a 12-core multicore machine.
                 Prism executes the benchmarks 1.2 to 2.1 times faster
                 than GraphLab's nondeterministic lock-based scheduler
                 while providing deterministic behavior. This article
                 also presents Prism-R, a variation of Prism that
                 executes dynamic data-graph computations
                 deterministically even when updates modify global
                 variables with associative operations. Prism-R
                 satisfies the same theoretical bounds as Prism, but its
                 implementation is more involved, incorporating a
                 multivector data structure to maintain a
                 deterministically ordered set of vertices partitioned
                 by color. Despite its additional complexity, Prism-R is
                 only marginally slower than Prism. On the seven
                 application benchmarks studied, Prism-R incurs a 7\%
                 geometric mean overhead relative to Prism.",
  acknowledgement = ack-nhfb,
  articleno =    "2",
  fjournal =     "ACM Transactions on Parallel Computing",
  journal-URL =  "http://dl.acm.org/citation.cfm?id=2632163",
}

@Article{Im:2016:CST,
  author =       "Sungjin Im and Benjamin Moseley and Kirk Pruhs and
                 Eric Torng",
  title =        "Competitively Scheduling Tasks with Intermediate
                 Parallelizability",
  journal =      j-TOPC,
  volume =       "3",
  number =       "1",
  pages =        "4:1--4:??",
  month =        aug,
  year =         "2016",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2938378",
  ISSN =         "2329-4949 (print), 2329-4957 (electronic)",
  ISSN-L =       "2329-4949",
  bibdate =      "Fri Sep 23 15:24:51 MDT 2016",
  bibsource =    "http://topc.acm.org/;
                 http://www.math.utah.edu/pub/tex/bib/topc.bib",
  abstract =     "We introduce a scheduling algorithm Intermediate-SRPT,
                 and show that it is $O (\log P)$-competitive with respect
                 to average flow time when scheduling jobs whose
                 parallelizability is intermediate between being fully
                 parallelizable and sequential. Here, the parameter P
                 denotes the ratio between the maximum job size to the
                 minimum. We also show a general matching lower bound on
                 the competitive ratio. Our analysis builds on an
                 interesting combination of potential function and local
                 competitiveness arguments.",
  acknowledgement = ack-nhfb,
  articleno =    "4",
  fjournal =     "ACM Transactions on Parallel Computing",
  journal-URL =  "http://dl.acm.org/citation.cfm?id=2632163",
}

@Article{Bercea:2016:CMI,
  author =       "Ioana O. Bercea and Navin Goyal and David G. Harris
                 and Aravind Srinivasan",
  title =        "On Computing Maximal Independent Sets of Hypergraphs
                 in Parallel",
  journal =      j-TOPC,
  volume =       "3",
  number =       "1",
  pages =        "5:1--5:??",
  month =        aug,
  year =         "2016",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2938436",
  ISSN =         "2329-4949 (print), 2329-4957 (electronic)",
  ISSN-L =       "2329-4949",
  bibdate =      "Fri Sep 23 15:24:51 MDT 2016",
  bibsource =    "http://topc.acm.org/;
                 http://www.math.utah.edu/pub/tex/bib/topc.bib",
  abstract =     "Whether or not the problem of finding maximal
                 independent sets (MIS) in hypergraphs is in (R)NC is
                 one of the fundamental problems in the theory of
                 parallel computing. Essentially, the challenge is to
                 design (randomized) algorithms in which the number of
                 processors used is polynomial and the (expected)
                 runtime is polylogarithmic in the size of the input.
                 Unlike the well-understood case of MIS in graphs, for
                 the hypergraph problem, our knowledge is quite limited
                 despite considerable work. It is known that the problem
                 is in RNC when the edges of the hypergraph have
                 constant size. For general hypergraphs with n vertices
                 and m edges, the fastest previously known algorithm
                  works in time $O(\sqrt n) $ with $ {\rm poly} (m, n) $
                 processors. In this article, we give an EREW PRAM
                 randomized algorithm that works in time $ n^{o (1)} $
                 with $O(n + m \log n) $ processors on general
                 hypergraphs satisfying $ m \leq n^{o (1) \log \log n /
                 \log \log \log n} $. We also give an EREW PRAM
                 deterministic algorithm that runs in time $ n^\epsilon
                 $ on a graph with $ m \leq n^{1 / \delta } $ edges, for
                 any constants $ \delta $, $ \epsilon $; the number of
                 processors is polynomial in $m$, $n$ for a fixed choice
                  of $ \delta $, $ \epsilon $. Our algorithms are based on a
                 sampling idea that reduces the dimension of the
                 hypergraph and employs the algorithm for constant
                 dimension hypergraphs as a subroutine.",
  acknowledgement = ack-nhfb,
  articleno =    "5",
  fjournal =     "ACM Transactions on Parallel Computing",
  journal-URL =  "http://dl.acm.org/citation.cfm?id=2632163",
}

@Article{Bilo:2016:LBN,
  author =       "Davide Bil{\`o} and Luciano Gual{\`a} and Stefano
                 Leucci and Guido Proietti",
  title =        "Locality-Based Network Creation Games",
  journal =      j-TOPC,
  volume =       "3",
  number =       "1",
  pages =        "6:1--6:??",
  month =        aug,
  year =         "2016",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2938426",
  ISSN =         "2329-4949 (print), 2329-4957 (electronic)",
  ISSN-L =       "2329-4949",
  bibdate =      "Fri Sep 23 15:24:51 MDT 2016",
  bibsource =    "http://topc.acm.org/;
                 http://www.math.utah.edu/pub/tex/bib/topc.bib",
  abstract =     "Network creation games have been extensively studied,
                 both by economists and computer scientists, due to
                 their versatility in modeling individual-based
                 community formation processes. These processes, in
                 turn, are the theoretical counterpart of several
                 economics, social, and computational applications on
                 the Internet. In their several variants, these games
                 model the tension of a player between the player's two
                 antagonistic goals: to be as close as possible to the
                 other players and to activate a cheapest possible set
                 of links. However, the generally adopted assumption is
                 that players have a common and complete information
                 about the ongoing network, which is quite unrealistic
                 in practice. In this article, we consider a more
                 compelling scenario in which players have only limited
                 information about the network in which they are
                 embedded. More precisely, we explore the game-theoretic
                 and computational implications of assuming that players
                 have a complete knowledge of the network structure only
                 up to a given radius k, which is one of the most
                 qualified local-knowledge models used in distributed
                 computing. In this respect, we define a suitable
                 equilibrium concept, and we provide a comprehensive set
                 of upper and lower bounds to the price of anarchy for
                 the entire range of values of k and for the two classic
                 variants of the game, namely, those in which a player's
                 cost --- besides the activation cost of the owned
                 links --- depends on the maximum/sum of all distances to
                 the other nodes in the network, respectively. These
                 bounds are assessed through an extensive set of
                 experiments.",
  acknowledgement = ack-nhfb,
  articleno =    "6",
  fjournal =     "ACM Transactions on Parallel Computing",
  journal-URL =  "http://dl.acm.org/citation.cfm?id=2632163",
}

@Article{Jiang:2016:PPA,
  author =       "Jiayang Jiang and Michael Mitzenmacher and Justin
                 Thaler",
  title =        "Parallel Peeling Algorithms",
  journal =      j-TOPC,
  volume =       "3",
  number =       "1",
  pages =        "7:1--7:??",
  month =        aug,
  year =         "2016",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2938412",
  ISSN =         "2329-4949 (print), 2329-4957 (electronic)",
  ISSN-L =       "2329-4949",
  bibdate =      "Fri Sep 23 15:24:51 MDT 2016",
  bibsource =    "http://topc.acm.org/;
                 http://www.math.utah.edu/pub/tex/bib/topc.bib",
  abstract =     "The analysis of several algorithms and data structures
                 can be framed as a peeling process on a random
                 hypergraph: vertices with degree less than k are
                 removed until there are no vertices of degree less than
                 k left. The remaining hypergraph is known as the
                 $k$-core. In this article, we analyze parallel peeling
                 processes, in which in each round, all vertices of
                 degree less than k are removed. It is known that, below
                 a specific edge-density threshold, the $k$-core is empty
                 with high probability. We show that, with high
                 probability, below this threshold, only $1 / \log((k -
                 1) (r - 1)) \log \log n + O (1)$ rounds of peeling are
                 needed to obtain the empty $k$-core for $r$-uniform
                 hypergraphs. This bound is tight up to an additive
                 constant. Interestingly, we show that, above this
                 threshold, $\Omega (\log n)$ rounds of peeling are
                 required to find the nonempty $k$-core. Since most
                 algorithms and data structures aim to peel to an empty
                 $k$-core, this asymmetry appears fortunate. We verify
                 the theoretical results both with simulation and with a
                 parallel implementation using graphics processing units
                 (GPUs). Our implementation provides insights into how
                 to structure parallel peeling algorithms for efficiency
                 in practice.",
  acknowledgement = ack-nhfb,
  articleno =    "7",
  fjournal =     "ACM Transactions on Parallel Computing",
  journal-URL =  "http://dl.acm.org/citation.cfm?id=2632163",
}

@Article{Simhadri:2016:EAS,
  author =       "Harsha Vardhan Simhadri and Guy E. Blelloch and Jeremy
                 T. Fineman and Phillip B. Gibbons and Aapo Kyrola",
  title =        "Experimental Analysis of Space-Bounded Schedulers",
  journal =      j-TOPC,
  volume =       "3",
  number =       "1",
  pages =        "8:1--8:??",
  month =        aug,
  year =         "2016",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2938389",
  ISSN =         "2329-4949 (print), 2329-4957 (electronic)",
  ISSN-L =       "2329-4949",
  bibdate =      "Fri Sep 23 15:24:51 MDT 2016",
  bibsource =    "http://topc.acm.org/;
                 http://www.math.utah.edu/pub/tex/bib/topc.bib",
  abstract =     "The running time of nested parallel programs on
                 shared-memory machines depends in significant part on
                 how well the scheduler mapping the program to the
                 machine is optimized for the organization of caches and
                 processor cores on the machine. Recent work proposed
                 ``space-bounded schedulers'' for scheduling such
                 programs on the multilevel cache hierarchies of current
                 machines. The main benefit of this class of schedulers
                 is that they provably preserve locality of the program
                 at every level in the hierarchy, which can result in
                 fewer cache misses and better use of bandwidth than the
                 popular work-stealing scheduler. On the other hand,
                 compared to work stealing, space-bounded schedulers are
                 inferior at load balancing and may have greater
                 scheduling overheads, raising the question as to the
                 relative effectiveness of the two schedulers in
                 practice. In this article, we provide the first
                 experimental study aimed at addressing this question.
                 To facilitate this study, we built a flexible
                 experimental framework with separate interfaces for
                 programs and schedulers. This enables a head-to-head
                 comparison of the relative strengths of schedulers in
                 terms of running times and cache miss counts across a
                 range of benchmarks. (The framework is validated by
                 comparisons with the Intel{\reg} Cilk{\TM} Plus
                 work-stealing scheduler.) We present experimental
                 results on a 32-core Xeon{\reg} 7560 comparing work
                 stealing, hierarchy-minded work stealing, and two
                 variants of space-bounded schedulers on both
                 divide-and-conquer microbenchmarks and some popular
                 algorithmic kernels. Our results indicate that
                 space-bounded schedulers reduce the number of L3 cache
                 misses compared to work-stealing schedulers by 25\% to
                 65\% for most of the benchmarks, but incur up to 27\%
                 additional scheduler and load-imbalance overhead. Only
                 for memory-intensive benchmarks can the reduction in
                 cache misses overcome the added overhead, resulting in
                 up to a 25\% improvement in running time for synthetic
                 benchmarks and about 20\% improvement for algorithmic
                 kernels. We also quantify runtime improvements varying
                 the available bandwidth per core (the ``bandwidth
                 gap'') and show up to 50\% improvements in the running
                 times of kernels as this gap increases fourfold. As
                 part of our study, we generalize prior definitions of
                 space-bounded schedulers to allow for more practical
                 variants (while still preserving their guarantees) and
                 explore implementation tradeoffs.",
  acknowledgement = ack-nhfb,
  articleno =    "8",
  fjournal =     "ACM Transactions on Parallel Computing",
  journal-URL =  "http://dl.acm.org/citation.cfm?id=2632163",
}

@Article{Sheikh:2016:SHJ,
  author =       "Hafiz Fahad Sheikh and Ishfaq Ahmad",
  title =        "Sixteen Heuristics for Joint Optimization of
                 Performance, Energy, and Temperature in Allocating
                 Tasks to Multi-Cores",
  journal =      j-TOPC,
  volume =       "3",
  number =       "2",
  pages =        "9:1--9:??",
  month =        aug,
  year =         "2016",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2948973",
  ISSN =         "2329-4949 (print), 2329-4957 (electronic)",
  ISSN-L =       "2329-4949",
  bibdate =      "Fri Sep 23 15:24:52 MDT 2016",
  bibsource =    "http://topc.acm.org/;
                 http://www.math.utah.edu/pub/tex/bib/topc.bib",
  abstract =     "Three-way joint optimization of performance ($P$),
                 energy ($E$), and temperature ($T$) in scheduling
                 parallel tasks to multiple cores poses a challenge that
                 is staggering in its computational complexity. The goal
                 of the PET optimized scheduling (PETOS) problem is to
                 minimize three quantities: the completion time of a
                 task graph, the total energy consumption, and the peak
                 temperature of the system. Algorithms based on
                 conventional multi-objective optimization techniques
                 can be designed for solving the PETOS problem. But
                 their execution times are exceedingly high and hence
                 their applicability is restricted merely to problems of
                 modest size. Exacerbating the problem is the solution
                 space that is typically a Pareto front since no single
                 solution can be strictly best along all three
                 objectives. Thus, not only is the absolute quality of
                 the solutions important but ``the spread of the
                 solutions'' along each objective and the distribution
                 of solutions within the generated tradeoff front are
                 also desired. A natural alternative is to design
                 efficient heuristic algorithms that can generate good
                 solutions as well as good spreads --- note that most of
                 the prior work in energy-efficient task allocation is
                 predominantly single- or dual-objective oriented. Given
                 a directed acyclic graph (DAG) representing a parallel
                 program, a heuristic encompasses policies as to what
                 tasks should go to what cores and at what frequency
                 should that core operate. Various policies, such as
                 greedy, iterative, and probabilistic, can be employed.
                 However, the choice and usage of these policies can
                 influence a heuristic towards a particular objective
                 and can also profoundly impact its performance. This
                 article proposes 16 heuristics that utilize various
                 methods for task-to-core allocation and frequency
                 selection. This article also presents a methodical
                 classification scheme which not only categorizes the
                 proposed heuristics but can also accommodate additional
                 heuristics. Extensive simulation experiments compare
                 these algorithms while shedding light on their
                 strengths and tradeoffs.",
  acknowledgement = ack-nhfb,
  articleno =    "9",
  fjournal =     "ACM Transactions on Parallel Computing",
  journal-URL =  "http://dl.acm.org/citation.cfm?id=2632163",
}

@Article{Blanchard:2016:SMO,
  author =       "Jeffrey D. Blanchard and Erik Opavsky and Emircan
                 Uysaler",
  title =        "Selecting Multiple Order Statistics with a Graphics
                 Processing Unit",
  journal =      j-TOPC,
  volume =       "3",
  number =       "2",
  pages =        "10:1--10:??",
  month =        aug,
  year =         "2016",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2948974",
  ISSN =         "2329-4949 (print), 2329-4957 (electronic)",
  ISSN-L =       "2329-4949",
  bibdate =      "Fri Sep 23 15:24:52 MDT 2016",
  bibsource =    "http://topc.acm.org/;
                 http://www.math.utah.edu/pub/tex/bib/topc.bib",
  abstract =     "Extracting a set of multiple order statistics from a
                 huge data set provides important information about the
                 distribution of the values in the full set of data.
                 This article introduces an algorithm,
                 bucketMultiSelect, for simultaneously selecting
                 multiple order statistics with a graphics processing
                 unit (GPU). Typically, when a large set of order
                 statistics is desired, the vector is sorted. When the
                 sorted version of the vector is not needed,
                 bucketMultiSelect significantly reduces computation
                 time by eliminating a large portion of the unnecessary
                 operations involved in sorting. For large vectors,
                 bucketMultiSelect returns thousands of order statistics
                 in less time than sorting the vector while typically
                 using less memory. For vectors containing $2^{28}$
                 values of type double, bucketMultiSelect selects the
                 101 percentile order statistics in less than 95 ms and
                 is more than $8 \times $ faster than sorting the vector
                 with a GPU optimized merge sort.",
  acknowledgement = ack-nhfb,
  articleno =    "10",
  fjournal =     "ACM Transactions on Parallel Computing",
  journal-URL =  "http://dl.acm.org/citation.cfm?id=2632163",
}

@Article{Bohme:2016:IRC,
  author =       "David B{\"o}hme and Markus Geimer and Lukas Arnold and
                 Felix Voigtlaender and Felix Wolf",
  title =        "Identifying the Root Causes of Wait States in
                 Large-Scale Parallel Applications",
  journal =      j-TOPC,
  volume =       "3",
  number =       "2",
  pages =        "11:1--11:??",
  month =        aug,
  year =         "2016",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2934661",
  ISSN =         "2329-4949 (print), 2329-4957 (electronic)",
  ISSN-L =       "2329-4949",
  bibdate =      "Fri Sep 23 15:24:52 MDT 2016",
  bibsource =    "http://topc.acm.org/;
                 http://www.math.utah.edu/pub/tex/bib/topc.bib",
  abstract =     "Driven by growing application requirements and
                 accelerated by current trends in microprocessor design,
                 the number of processor cores on modern supercomputers
                 is increasing from generation to generation. However,
                 load or communication imbalance prevents many codes
                 from taking advantage of the available parallelism, as
                 delays of single processes may spread wait states
                 across the entire machine. Moreover, when employing
                 complex point-to-point communication patterns, wait
                 states may propagate along far-reaching cause-effect
                 chains that are hard to track manually and that
                 complicate an assessment of the actual costs of an
                 imbalance. Building on earlier work by Meira, Jr., et
                 al., we present a scalable approach that identifies
                 program wait states and attributes their costs in terms
                 of resource waste to their original cause. By replaying
                 event traces in parallel both forward and backward, we
                 can identify the processes and call paths responsible
                 for the most severe imbalances, even for runs with
                 hundreds of thousands of processes.",
  acknowledgement = ack-nhfb,
  articleno =    "11",
  fjournal =     "ACM Transactions on Parallel Computing",
  journal-URL =  "http://dl.acm.org/citation.cfm?id=2632163",
}

@Article{Dathathri:2016:CAL,
  author =       "Roshan Dathathri and Ravi Teja Mullapudi and Uday
                 Bondhugula",
  title =        "Compiling Affine Loop Nests for a Dynamic Scheduling
                 Runtime on Shared and Distributed Memory",
  journal =      j-TOPC,
  volume =       "3",
  number =       "2",
  pages =        "12:1--12:??",
  month =        aug,
  year =         "2016",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2948975",
  ISSN =         "2329-4949 (print), 2329-4957 (electronic)",
  ISSN-L =       "2329-4949",
  bibdate =      "Fri Sep 23 15:24:52 MDT 2016",
  bibsource =    "http://topc.acm.org/;
                 http://www.math.utah.edu/pub/tex/bib/topc.bib",
  abstract =     "Current de-facto parallel programming models like
                 OpenMP and MPI make it difficult to extract task-level
                 dataflow parallelism as opposed to bulk-synchronous
                 parallelism. Task parallel approaches that use
                 point-to-point synchronization between dependent tasks
                 in conjunction with dynamic scheduling dataflow
                 runtimes are thus becoming attractive. Although good
                 performance can be extracted for both shared and
                 distributed memory using these approaches, there is
                 little compiler support for them. In this article, we
                 describe the design of compiler--runtime interaction to
                 automatically extract coarse-grained dataflow
                 parallelism in affine loop nests for both shared and
                 distributed-memory architectures. We use techniques
                 from the polyhedral compiler framework to extract tasks
                 and generate components of the runtime that are used to
                 dynamically schedule the generated tasks. The runtime
                 includes a distributed decentralized scheduler that
                 dynamically schedules tasks on a node. The schedulers
                 on different nodes cooperate with each other through
                 asynchronous point-to-point communication, and all of
                 this is achieved by code automatically generated by the
                 compiler. On a set of six representative affine loop
                 nest benchmarks, while running on 32 nodes with 8
                 threads each, our compiler-assisted runtime yields a
                 geometric mean speedup of $ 143.6 \times $ ($ 70.3
                 \times $ to $ 474.7 \times $) over the sequential
                 version and a geometric mean speedup of $ 1.64 \times $
                 ($ 1.04 \times $ to $ 2.42 \times $) over the
                 state-of-the-art automatic parallelization approach
                 that uses bulk synchronization. We also compare our
                 system with past work that addresses some of these
                 challenges on shared memory, and an emerging runtime
                 (Intel Concurrent Collections) that demands higher
                 programmer input and effort in parallelizing. To the
                 best of our knowledge, ours is also the first automatic
                 scheme that allows for dynamic scheduling of affine
                 loop nests on a cluster of multicores.",
  acknowledgement = ack-nhfb,
  articleno =    "12",
  fjournal =     "ACM Transactions on Parallel Computing",
  journal-URL =  "http://dl.acm.org/citation.cfm?id=2632163",
}

@Article{Benoit:2016:AGP,
  author =       "Anne Benoit and Aur{\'e}lien Cavelan and Yves Robert
                 and Hongyang Sun",
  title =        "Assessing General-Purpose Algorithms to Cope with
                 Fail-Stop and Silent Errors",
  journal =      j-TOPC,
  volume =       "3",
  number =       "2",
  pages =        "13:1--13:??",
  month =        aug,
  year =         "2016",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2897189",
  ISSN =         "2329-4949 (print), 2329-4957 (electronic)",
  ISSN-L =       "2329-4949",
  bibdate =      "Fri Sep 23 15:24:52 MDT 2016",
  bibsource =    "http://topc.acm.org/;
                 http://www.math.utah.edu/pub/tex/bib/topc.bib",
  abstract =     "In this article, we combine the traditional
                 checkpointing and rollback recovery strategies with
                 verification mechanisms to cope with both fail-stop and
                 silent errors. The objective is to minimize makespan
                 and/or energy consumption. For divisible load
                 applications, we use first-order approximations to find
                 the optimal checkpointing period to minimize execution
                 time, with an additional verification mechanism to
                 detect silent errors before each checkpoint, hence
                 extending the classical formula by Young and Daly for
                 fail-stop errors only. We further extend the approach
                 to include intermediate verifications, and to consider
                 a bicriteria problem involving both time and energy
                 (linear combination of execution time and energy
                 consumption). Then, we focus on application workflows
                 whose dependence graph is a linear chain of tasks.
                 Here, we determine the optimal checkpointing and
                 verification locations, with or without intermediate
                 verifications, for the bicriteria problem. Rather than
                 using a single speed during the whole execution, we
                 further introduce a new execution scenario, which
                 allows for changing the execution speed via Dynamic
                 Voltage and Frequency Scaling (DVFS). In this latter
                 scenario, we determine the optimal checkpointing and
                 verification locations, as well as the optimal speed
                 pairs for each task segment between any two consecutive
                 checkpoints. Finally, we conduct an extensive set of
                 simulations to support the theoretical study, and to
                 assess the performance of each algorithm, showing that
                 the best overall performance is achieved under the most
                 flexible scenario using intermediate verifications and
                 different speeds.",
  acknowledgement = ack-nhfb,
  articleno =    "13",
  fjournal =     "ACM Transactions on Parallel Computing",
  journal-URL =  "http://dl.acm.org/citation.cfm?id=2632163",
}

@Article{Koutis:2016:SPD,
  author =       "Ioannis Koutis and Shen Chen Xu",
  title =        "Simple Parallel and Distributed Algorithms for
                 Spectral Graph Sparsification",
  journal =      j-TOPC,
  volume =       "3",
  number =       "2",
  pages =        "14:1--14:??",
  month =        aug,
  year =         "2016",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2948062",
  ISSN =         "2329-4949 (print), 2329-4957 (electronic)",
  ISSN-L =       "2329-4949",
  bibdate =      "Fri Sep 23 15:24:52 MDT 2016",
  bibsource =    "http://topc.acm.org/;
                 http://www.math.utah.edu/pub/tex/bib/topc.bib",
  abstract =     "We describe simple algorithms for spectral graph
                 sparsification, based on iterative computations of
                 weighted spanners and sampling. Leveraging the
                 algorithms of Baswana and Sen for computing spanners,
                 we obtain the first distributed spectral sparsification
                 algorithm in the CONGEST model. We also obtain a
                 parallel algorithm with improved work and time
                 guarantees, as well as other natural distributed
                 implementations. Combining this algorithm with the
                 parallel framework of Peng and Spielman for solving
                 symmetric diagonally dominant linear systems, we get a
                 parallel solver that is significantly more efficient in
                 terms of the total work.",
  acknowledgement = ack-nhfb,
  articleno =    "14",
  fjournal =     "ACM Transactions on Parallel Computing",
  journal-URL =  "http://dl.acm.org/citation.cfm?id=2632163",
}

@Article{Dorier:2016:DAP,
  author =       "Matthieu Dorier and Gabriel Antoniu and Franck
                 Cappello and Marc Snir and Robert Sisneros and Orcun
                 Yildiz and Shadi Ibrahim and Tom Peterka and Leigh
                 Orf",
  title =        "{Damaris}: Addressing Performance Variability in Data
                 Management for Post-Petascale Simulations",
  journal =      j-TOPC,
  volume =       "3",
  number =       "3",
  pages =        "15:1--15:??",
  month =        dec,
  year =         "2016",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2987371",
  ISSN =         "2329-4949 (print), 2329-4957 (electronic)",
  ISSN-L =       "2329-4949",
  bibdate =      "Mon Dec 26 17:40:41 MST 2016",
  bibsource =    "http://topc.acm.org/;
                 http://www.math.utah.edu/pub/tex/bib/topc.bib",
  abstract =     "With exascale computing on the horizon, reducing
                 performance variability in data management tasks
                 (storage, visualization, analysis, etc.) is becoming a
                 key challenge in sustaining high performance. This
                 variability significantly impacts the overall
                 application performance at scale and its predictability
                 over time. In this article, we present Damaris, a
                 system that leverages dedicated cores in multicore
                 nodes to offload data management tasks, including I/O,
                 data compression, scheduling of data movements, in situ
                 analysis, and visualization. We evaluate Damaris with
                 the CM1 atmospheric simulation and the Nek5000
                 computational fluid dynamic simulation on four
                 platforms, including NICS's Kraken and NCSA's Blue
                 Waters. Our results show that (1) Damaris fully hides
                 the I/O variability as well as all I/O-related costs,
                 thus making simulation performance predictable; (2) it
                 increases the sustained write throughput by a factor of
                 up to 15 compared with standard I/O approaches; (3) it
                 allows almost perfect scalability of the simulation up
                 to over 9,000 cores, as opposed to state-of-the-art
                 approaches that fail to scale; and (4) it enables a
                 seamless connection to the VisIt visualization software
                 to perform in situ analysis and visualization in a way
                 that impacts neither the performance of the simulation
                 nor its variability. In addition, we extended our
                 implementation of Damaris to also support the use of
                 dedicated nodes and conducted a thorough comparison of
                 the two approaches --- dedicated cores and dedicated
                 nodes --- for I/O tasks with the aforementioned
                 applications.",
  acknowledgement = ack-nhfb,
  articleno =    "15",
  fjournal =     "ACM Transactions on Parallel Computing",
  journal-URL =  "http://dl.acm.org/citation.cfm?id=2632163",
}

@Article{Gao:2016:AOM,
  author =       "Jiaquan Gao and Yu Wang and Jun Wang and Ronghua
                 Liang",
  title =        "Adaptive Optimization Modeling of Preconditioned
                 Conjugate Gradient on Multi-{GPUs}",
  journal =      j-TOPC,
  volume =       "3",
  number =       "3",
  pages =        "16:1--16:??",
  month =        dec,
  year =         "2016",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2990849",
  ISSN =         "2329-4949 (print), 2329-4957 (electronic)",
  ISSN-L =       "2329-4949",
  bibdate =      "Mon Dec 26 17:40:41 MST 2016",
  bibsource =    "http://topc.acm.org/;
                 http://www.math.utah.edu/pub/tex/bib/topc.bib",
  abstract =     "The preconditioned conjugate gradient (PCG) algorithm
                 is a well-known iterative method for solving sparse
                 linear systems in scientific computations.
                 GPU-accelerated PCG algorithms for large-sized problems
                 have attracted considerable attention recently.
                 However, on a specific multi-GPU platform, producing a
                 highly parallel PCG implementation for any large-sized
                 problem requires significant time because several
                 manual steps are involved in adjusting the related
                 parameters and selecting an appropriate storage format
                 for the matrix block that is assigned to each GPU. This
                 motivates us to propose adaptive optimization modeling
                 of PCG on multi-GPUs, which mainly involves the
                 following parts: (1) an optimization multi-GPU parallel
                 framework of PCG and (2) the profile-based optimization
                 modeling for each one of the main components of the PCG
                 algorithm, including vector operation, inner product,
                 and sparse matrix-vector multiplication (SpMV). Our
                 model does not construct a new storage format or kernel
                 but automatically and rapidly generates an optimal
                 parallel PCG algorithm for any problem on a specific
                 multi-GPU platform by integrating existing storage
                 formats and kernels. We take a vector operation kernel,
                 an inner-product kernel, and five popular SpMV kernels
                 for an example to present the idea of constructing the
                 model. Given that our model is general, independent of
                 the problems, and dependent on the resources of
                 devices, this model is constructed only once for each
                 type of GPU. The experiments validate the high
                 efficiency of our proposed model.",
  acknowledgement = ack-nhfb,
  articleno =    "16",
  fjournal =     "ACM Transactions on Parallel Computing",
  journal-URL =  "http://dl.acm.org/citation.cfm?id=2632163",
}

@Article{Creech:2016:TSS,
  author =       "Timothy Creech and Rajeev Barua",
  title =        "Transparently Space Sharing a Multicore Among Multiple
                 Processes",
  journal =      j-TOPC,
  volume =       "3",
  number =       "3",
  pages =        "17:1--17:??",
  month =        dec,
  year =         "2016",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3001910",
  ISSN =         "2329-4949 (print), 2329-4957 (electronic)",
  ISSN-L =       "2329-4949",
  bibdate =      "Mon Dec 26 17:40:41 MST 2016",
  bibsource =    "http://topc.acm.org/;
                 http://www.math.utah.edu/pub/tex/bib/topc.bib",
  abstract =     "As hardware becomes increasingly parallel and the
                 availability of scalable parallel software improves,
                 the problem of managing multiple multithreaded
                 applications (processes) becomes important. Malleable
                 processes, which can vary the number of threads used as
                 they run, enable sophisticated and flexible resource
                 management. Although many existing applications
                 parallelized for SMPs with parallel runtimes are in
                 fact already malleable, deployed runtime environments
                 provide no interface nor any strategy for intelligently
                 allocating hardware threads or even preventing
                 oversubscription. Prior research methods either depend
                 on profiling applications ahead of time to make good
                 decisions about allocations or do not account for
                 process efficiency at all, leading to poor performance.
                 None of these prior methods have been adapted widely in
                 practice. This article presents the Scheduling and
                 Allocation with Feedback (SCAF) system: a drop-in
                 runtime solution that supports existing malleable
                 applications in making intelligent allocation decisions
                 based on observed efficiency without any changes to
                 semantics, program modification, offline profiling, or
                 even recompilation. Our existing implementation can
                 control most unmodified OpenMP applications. Other
                 malleable threading libraries can also easily be
                 supported with small modifications without requiring
                 application modification or recompilation. In this
                 work, we present the SCAF daemon and a SCAF-aware port
                 of the GNU OpenMP runtime. We present a new technique
                 for estimating process efficiency purely at runtime
                 using available hardware counters and demonstrate its
                 effectiveness in aiding allocation decisions. We
                 evaluated SCAF using NAS NPB parallel benchmarks on
                 five commodity parallel platforms, enumerating
                 architectural features and their effects on our scheme.
                 We measured the benefit of SCAF in terms of sum of
                 speedups improvement (a common metric for
                 multiprogrammed environments) when running all
                 benchmark pairs concurrently compared to
                 equipartitioning --- the best existing competing scheme
                 in the literature. We found that SCAF improves on
                 equipartitioning on four out of five machines, showing
                 a mean improvement factor in sum of speedups of 1.04 to
                 1.11x for benchmark pairs, depending on the machine,
                 and 1.09x on average. Since we are not aware of any
                 widely available tool for equipartitioning, we also
                 compare SCAF against multiprogramming using unmodified
                 OpenMP, which is the only environment available to end
                 users today. SCAF improves on the unmodified OpenMP
                 runtimes for all five machines, with a mean improvement
                 of 1.08 to 2.07x, depending on the machine, and 1.59x
                 on average.",
  acknowledgement = ack-nhfb,
  articleno =    "17",
  fjournal =     "ACM Transactions on Parallel Computing",
  journal-URL =  "http://dl.acm.org/citation.cfm?id=2632163",
}

@Article{Ballard:2016:HPS,
  author =       "Grey Ballard and Alex Druinsky and Nicholas Knight and
                 Oded Schwartz",
  title =        "Hypergraph Partitioning for Sparse Matrix--Matrix
                 Multiplication",
  journal =      j-TOPC,
  volume =       "3",
  number =       "3",
  pages =        "18:1--18:??",
  month =        dec,
  year =         "2016",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3015144",
  ISSN =         "2329-4949 (print), 2329-4957 (electronic)",
  ISSN-L =       "2329-4949",
  bibdate =      "Mon Dec 26 17:40:41 MST 2016",
  bibsource =    "http://topc.acm.org/;
                 http://www.math.utah.edu/pub/tex/bib/topc.bib",
  abstract =     "We propose a fine-grained hypergraph model for sparse
                 matrix-matrix multiplication (SpGEMM), a key
                 computational kernel in scientific computing and data
                 analysis whose performance is often communication
                 bound. This model correctly describes both the
                 interprocessor communication volume along a critical
                 path in a parallel computation and also the volume of
                 data moving through the memory hierarchy in a
                 sequential computation. We show that identifying a
                 communication-optimal algorithm for particular input
                 matrices is equivalent to solving a hypergraph
                 partitioning problem. Our approach is nonzero structure
                 dependent, meaning that we seek the best algorithm for
                 the given input matrices. In addition to our
                 three-dimensional fine-grained model, we also propose
                 coarse-grained one-dimensional and two-dimensional
                 models that correspond to simpler SpGEMM algorithms. We
                 explore the relations between our models theoretically,
                 and we study their performance experimentally in the
                 context of three applications that use SpGEMM as a key
                 computation. For each application, we find that at
                 least one coarse-grained model is as communication
                 efficient as the fine-grained model. We also observe
                 that different applications have affinities for
                 different algorithms. Our results demonstrate that
                 hypergraphs are an accurate model for reasoning about
                 the communication costs of SpGEMM as well as a
                 practical tool for exploring the SpGEMM algorithm
                 design space.",
  acknowledgement = ack-nhfb,
  articleno =    "18",
  fjournal =     "ACM Transactions on Parallel Computing",
  journal-URL =  "http://dl.acm.org/citation.cfm?id=2632163",
}

@Article{Grove:2017:ISS,
  author =       "David Grove",
  title =        "Introduction to the Special Section on {PPoPP'15}",
  journal =      j-TOPC,
  volume =       "3",
  number =       "4",
  pages =        "19:1--19:??",
  month =        mar,
  year =         "2017",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3040224",
  ISSN =         "2329-4949 (print), 2329-4957 (electronic)",
  ISSN-L =       "2329-4949",
  bibdate =      "Sat Mar 25 07:55:06 MDT 2017",
  bibsource =    "http://topc.acm.org/;
                 http://www.math.utah.edu/pub/tex/bib/topc.bib",
  acknowledgement = ack-nhfb,
  articleno =    "19",
  fjournal =     "ACM Transactions on Parallel Computing",
  journal-URL =  "http://dl.acm.org/citation.cfm?id=2632163",
}

@Article{Majo:2017:LPC,
  author =       "Zoltan Majo and Thomas R. Gross",
  title =        "A Library for Portable and Composable Data Locality
                 Optimizations for {NUMA} Systems",
  journal =      j-TOPC,
  volume =       "3",
  number =       "4",
  pages =        "20:1--20:??",
  month =        mar,
  year =         "2017",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3040222",
  ISSN =         "2329-4949 (print), 2329-4957 (electronic)",
  ISSN-L =       "2329-4949",
  bibdate =      "Sat Mar 25 07:55:06 MDT 2017",
  bibsource =    "http://topc.acm.org/;
                 http://www.math.utah.edu/pub/tex/bib/topc.bib",
  abstract =     "Many recent multiprocessor systems are realized with a
                 nonuniform memory architecture (NUMA) and accesses to
                 remote memory locations take more time than local
                 memory accesses. Optimizing NUMA memory system
                 performance is difficult and costly for three principal
                 reasons: (1) Today's programming languages/libraries
                 have no explicit support for NUMA systems, (2) NUMA
                 optimizations are not portable, and (3) optimizations
                 are not composable (i.e., they can become ineffective
                 or worsen performance in environments that support
                 composable parallel software). This article presents
                 TBB-NUMA, a parallel programming library based on Intel
                 Threading Building Blocks (TBB) that supports portable
                 and composable NUMA-aware programming. TBB-NUMA
                 provides a model of task affinity that captures a
                 programmer's insights on mapping tasks to resources.
                 NUMA-awareness affects all layers of the library (i.e.,
                 resource management, task scheduling, and high-level
                 parallel algorithm templates) and requires close
                 coupling between all these layers. Optimizations
                 implemented with TBB-NUMA (for a set of standard
                 benchmark programs) result in up to 44\% performance
                 improvement over standard TBB. But more important,
                 optimized programs are portable across different NUMA
                 architectures and preserve data locality also when
                 composed with other parallel computations sharing the
                 same resource management layer.",
  acknowledgement = ack-nhfb,
  articleno =    "20",
  fjournal =     "ACM Transactions on Parallel Computing",
  journal-URL =  "http://dl.acm.org/citation.cfm?id=2632163",
}

@Article{Golan-Gueta:2017:ASA,
  author =       "Guy Golan-Gueta and G. Ramalingam and Mooly Sagiv and
                 Eran Yahav",
  title =        "Automatic Scalable Atomicity via Semantic Locking",
  journal =      j-TOPC,
  volume =       "3",
  number =       "4",
  pages =        "21:1--21:??",
  month =        mar,
  year =         "2017",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3040223",
  ISSN =         "2329-4949 (print), 2329-4957 (electronic)",
  ISSN-L =       "2329-4949",
  bibdate =      "Sat Mar 25 07:55:06 MDT 2017",
  bibsource =    "http://topc.acm.org/;
                 http://www.math.utah.edu/pub/tex/bib/topc.bib",
  abstract =     "In this article, we consider concurrent programs in
                 which the shared state consists of instances of
                 linearizable abstract data types (ADTs). We present an
                 automated approach to concurrency control that
                 addresses a common need: the need to atomically execute
                 a code fragment, which may contain multiple ADT
                 operations on multiple ADT instances. We present a
                 synthesis algorithm that automatically enforces
                 atomicity of given code fragments (in a client program)
                 by inserting pessimistic synchronization that
                 guarantees atomicity and deadlock-freedom (without
                 using any rollback mechanism). Our algorithm takes a
                 commutativity specification as an extra input. This
                 specification indicates for every pair of ADT
                 operations the conditions under which the operations
                 commute. Our algorithm enables greater parallelism by
                 permitting commuting operations to execute
                 concurrently. We have implemented the synthesis
                 algorithm in a Java compiler and applied it to several
                 Java programs. Our results show that our approach
                 produces efficient and scalable synchronization.",
  acknowledgement = ack-nhfb,
  articleno =    "21",
  fjournal =     "ACM Transactions on Parallel Computing",
  journal-URL =  "http://dl.acm.org/citation.cfm?id=2632163",
}

@Article{Izraelevitz:2017:GSN,
  author =       "Joseph Izraelevitz and Michael L. Scott",
  title =        "Generality and Speed in Nonblocking Dual Containers",
  journal =      j-TOPC,
  volume =       "3",
  number =       "4",
  pages =        "22:1--22:??",
  month =        mar,
  year =         "2017",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3040220",
  ISSN =         "2329-4949 (print), 2329-4957 (electronic)",
  ISSN-L =       "2329-4949",
  bibdate =      "Sat Mar 25 07:55:06 MDT 2017",
  bibsource =    "http://topc.acm.org/;
                 http://www.math.utah.edu/pub/tex/bib/topc.bib",
  abstract =     "Nonblocking dual data structures extend traditional
                 notions of nonblocking progress to accommodate partial
                 methods, both by bounding the number of steps that a
                 thread can execute after its preconditions have been
                 satisfied and by ensuring that a waiting thread
                 performs no remote memory accesses that could interfere
                 with the execution of other threads. A nonblocking dual
                 container, in particular, is designed to hold either
                 data or requests. An insert operation either adds data
                 to the container or removes and satisfies a request; a
                 remove operation either takes data out of the container
                 or inserts a request. We present the first
                 general-purpose construction for nonblocking dual
                 containers, allowing any nonblocking container for data
                 to be paired with almost any nonblocking container for
                 requests. We also present new custom algorithms, based
                 on the LCRQ of Morrison and Afek, that outperform the
                 fastest previously known dual containers by factors of
                 four to six.",
  acknowledgement = ack-nhfb,
  articleno =    "22",
  fjournal =     "ACM Transactions on Parallel Computing",
  journal-URL =  "http://dl.acm.org/citation.cfm?id=2632163",
}

@Article{Cole:2017:ROS,
  author =       "Richard Cole and Vijaya Ramachandran",
  title =        "Resource Oblivious Sorting on Multicores",
  journal =      j-TOPC,
  volume =       "3",
  number =       "4",
  pages =        "23:1--23:??",
  month =        mar,
  year =         "2017",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3040221",
  ISSN =         "2329-4949 (print), 2329-4957 (electronic)",
  ISSN-L =       "2329-4949",
  bibdate =      "Sat Mar 25 07:55:06 MDT 2017",
  bibsource =    "http://topc.acm.org/;
                 http://www.math.utah.edu/pub/tex/bib/topc.bib",
  abstract =     "We present a deterministic sorting algorithm, Sample,
                 Partition, and Merge Sort (SPMS), that interleaves the
                 partitioning of a sample sort with merging.
                 Sequentially, it sorts $n$ elements in $ O(n \log n)$
                 time cache-obliviously with an optimal number of cache
                 misses. The parallel complexity (or critical path
                 length) of the algorithm is $ O(\log n \log \log n)$,
                 which improves on previous bounds for deterministic
                 sample sort. The algorithm also has low false sharing
                 costs. When scheduled by a work-stealing scheduler in a
                 multicore computing environment with a global shared
                 memory and $p$ cores, each having a cache of size $M$
                 organized in blocks of size $B$, the costs of the
                 additional cache misses and false sharing misses due to
                 this parallel execution are bounded by the cost of $
                 O(S \cdot M / B)$ and $ O(S \cdot B)$ cache misses,
                 respectively, where $S$ is the number of steals
                 performed during the execution. Finally, SPMS is
                 resource oblivious in that the dependence on machine
                 parameters appear only in the analysis of its
                 performance and not within the algorithm itself.",
  acknowledgement = ack-nhfb,
  articleno =    "23",
  fjournal =     "ACM Transactions on Parallel Computing",
  journal-URL =  "http://dl.acm.org/citation.cfm?id=2632163",
}

@Article{Ballard:2017:GEIa,
  author =       "Grey Ballard and Mary Hall and Tim Harris and Brandon
                 Lucia",
  title =        "{Guest Editor} Introduction {PPoPP 2016}, Special
                 Issue 1 of 2",
  journal =      j-TOPC,
  volume =       "4",
  number =       "1",
  pages =        "1:1--1:??",
  month =        oct,
  year =         "2017",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3108141",
  ISSN =         "2329-4949 (print), 2329-4957 (electronic)",
  ISSN-L =       "2329-4949",
  bibdate =      "Tue Oct 10 17:42:07 MDT 2017",
  bibsource =    "http://topc.acm.org/;
                 http://www.math.utah.edu/pub/tex/bib/topc.bib",
  acknowledgement = ack-nhfb,
  articleno =    "1",
  fjournal =     "ACM Transactions on Parallel Computing",
  journal-URL =  "http://dl.acm.org/citation.cfm?id=2632163",
}

@Article{Ashkiani:2017:GME,
  author =       "Saman Ashkiani and Andrew Davidson and Ulrich Meyer
                 and John D. Owens",
  title =        "{GPU Multisplit}: an Extended Study of a Parallel
                 Algorithm",
  journal =      j-TOPC,
  volume =       "4",
  number =       "1",
  pages =        "2:1--2:??",
  month =        oct,
  year =         "2017",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3108139",
  ISSN =         "2329-4949 (print), 2329-4957 (electronic)",
  ISSN-L =       "2329-4949",
  bibdate =      "Tue Oct 10 17:42:07 MDT 2017",
  bibsource =    "http://topc.acm.org/;
                 http://www.math.utah.edu/pub/tex/bib/topc.bib",
  abstract =     "Multisplit is a broadly useful parallel primitive that
                 permutes its input data into contiguous buckets or
                 bins, where the function that categorizes an element
                 into a bucket is provided by the programmer. Due to the
                 lack of an efficient multisplit on Graphics Processing
                 Units (GPUs), programmers often choose to implement
                 multisplit with a sort. One way is to first generate an
                 auxiliary array of bucket IDs and then sort input data
                 based on it. In case smaller indexed buckets possess
                 smaller valued keys, another way for multisplit is to
                 directly sort input data. Both methods are inefficient
                 and require more work than necessary: The former
                 requires more expensive data movements while the latter
                 spends unnecessary effort in sorting elements within
                 each bucket. In this work, we provide a parallel model
                 and multiple implementations for the multisplit
                 problem. Our principal focus is multisplit for a small
                 (up to 256) number of buckets. We use warp-synchronous
                 programming models and emphasize warpwide
                 communications to avoid branch divergence and reduce
                 memory usage. We also hierarchically reorder input
                 elements to achieve better coalescing of global memory
                 accesses. On a GeForce GTX 1080 GPU, we can reach a
                 peak throughput of 18.93Gkeys/s (or 11.68Gpairs/s) for
                 a key-only (or key-value) multisplit. Finally, we
                 demonstrate how multisplit can be used as a building
                 block for radix sort. In our multisplit-based sort
                 implementation, we achieve comparable performance to
                 the fastest GPU sort routines, sorting 32-bit keys (and
                 key-value pairs) with a throughput of 3.0Gkeys/s (and
                 2.1Gpair/s).",
  acknowledgement = ack-nhfb,
  articleno =    "2",
  fjournal =     "ACM Transactions on Parallel Computing",
  journal-URL =  "http://dl.acm.org/citation.cfm?id=2632163",
}

@Article{Wang:2017:GGG,
  author =       "Yangzihao Wang and Yuechao Pan and Andrew Davidson and
                 Yuduo Wu and Carl Yang and Leyuan Wang and Muhammad
                 Osama and Chenshan Yuan and Weitang Liu and Andy T.
                 Riffel and John D. Owens",
  title =        "{Gunrock}: {GPU} Graph Analytics",
  journal =      j-TOPC,
  volume =       "4",
  number =       "1",
  pages =        "3:1--3:??",
  month =        oct,
  year =         "2017",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3108140",
  ISSN =         "2329-4949 (print), 2329-4957 (electronic)",
  ISSN-L =       "2329-4949",
  bibdate =      "Tue Oct 10 17:42:07 MDT 2017",
  bibsource =    "http://topc.acm.org/;
                 http://www.math.utah.edu/pub/tex/bib/topc.bib",
  abstract =     "For large-scale graph analytics on the GPU, the
                 irregularity of data access and control flow, and the
                 complexity of programming GPUs, have presented two
                 significant challenges to developing a programmable
                 high-performance graph library. ``Gunrock,'' our
                 graph-processing system designed specifically for the
                 GPU, uses a high-level, bulk-synchronous, data-centric
                 abstraction focused on operations on a vertex or edge
                 frontier. Gunrock achieves a balance between
                 performance and expressiveness by coupling
                 high-performance GPU computing primitives and
                 optimization strategies with a high-level programming
                 model that allows programmers to quickly develop new
                 graph primitives with small code size and minimal GPU
                 programming knowledge. We characterize the performance
                 of various optimization strategies and evaluate
                 Gunrock's overall performance on different GPU
                 architectures on a wide range of graph primitives that
                 span from traversal-based algorithms and ranking
                 algorithms, to triangle counting and
                 bipartite-graph-based algorithms. The results show that
                 on a single GPU, Gunrock has on average at least an
                 order of magnitude speedup over Boost and PowerGraph,
                 comparable performance to the fastest GPU hardwired
                 primitives and CPU shared-memory graph libraries, such
                 as Ligra and Galois, and better performance than any
                 other GPU high-level graph library.",
  acknowledgement = ack-nhfb,
  articleno =    "3",
  fjournal =     "ACM Transactions on Parallel Computing",
  journal-URL =  "http://dl.acm.org/citation.cfm?id=2632163",
}

@Article{Chowdhury:2017:AAD,
  author =       "Rezaul Chowdhury and Pramod Ganapathi and Stephen
                 Tschudi and Jesmin Jahan Tithi and Charles Bachmeier
                 and Charles E. Leiserson and Armando Solar-Lezama and
                 Bradley C. Kuszmaul and Yuan Tang",
  title =        "{Autogen}: Automatic Discovery of Efficient Recursive
                 Divide-\&-Conquer Algorithms for Solving Dynamic
                 Programming Problems",
  journal =      j-TOPC,
  volume =       "4",
  number =       "1",
  pages =        "4:1--4:??",
  month =        oct,
  year =         "2017",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3125632",
  ISSN =         "2329-4949 (print), 2329-4957 (electronic)",
  ISSN-L =       "2329-4949",
  bibdate =      "Tue Oct 10 17:42:07 MDT 2017",
  bibsource =    "http://topc.acm.org/;
                 http://www.math.utah.edu/pub/tex/bib/topc.bib",
  abstract =     "We present Autogen --- an algorithm that for a wide
                 class of dynamic programming (DP) problems
                 automatically discovers highly efficient
                 cache-oblivious parallel recursive divide-and-conquer
                 algorithms from inefficient iterative descriptions of
                 DP recurrences. Autogen analyzes the set of DP table
                 locations accessed by the iterative algorithm when run
                 on a DP table of small size and automatically
                 identifies a recursive access pattern and a
                 corresponding provably correct recursive algorithm for
                 solving the DP recurrence. We use Autogen to
                 autodiscover efficient algorithms for several
                 well-known problems. Our experimental results show that
                 several autodiscovered algorithms significantly
                 outperform parallel looping and tiled loop-based
                 algorithms. Also, these algorithms are less sensitive
                 to fluctuations of memory and bandwidth compared with
                 their looping counterparts, and their running times and
                 energy profiles remain relatively more stable. To the
                 best of our knowledge, Autogen is the first algorithm
                 that can automatically discover new nontrivial
                 divide-and-conquer algorithms.",
  acknowledgement = ack-nhfb,
  articleno =    "4",
  fjournal =     "ACM Transactions on Parallel Computing",
  journal-URL =  "http://dl.acm.org/citation.cfm?id=2632163",
}

@Article{Steele:2017:AAC,
  author =       "Guy L. {Steele Jr.} and Jean-Baptiste Tristan",
  title =        "Adding Approximate Counters",
  journal =      j-TOPC,
  volume =       "4",
  number =       "1",
  pages =        "5:1--5:??",
  month =        oct,
  year =         "2017",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3132167",
  ISSN =         "2329-4949 (print), 2329-4957 (electronic)",
  ISSN-L =       "2329-4949",
  bibdate =      "Tue Oct 10 17:42:07 MDT 2017",
  bibsource =    "http://topc.acm.org/;
                 http://www.math.utah.edu/pub/tex/bib/topc.bib",
  abstract =     "We describe a general framework for adding the values
                 of two approximate counters to produce a new
                 approximate counter value whose expected estimated
                 value is equal to the sum of the expected estimated
                 values of the given approximate counters. (To the best
                 of our knowledge, this is the first published
                 description of any algorithm for adding two approximate
                 counters.) We then work out implementation details for
                 five different kinds of approximate counter and provide
                 optimized pseudocode. For three of them, we present
                 proofs that the variance of a counter value produced by
                 adding two counter values in this way is bounded, and
                 in fact is no worse, or not much worse, than the
                 variance of the value of a single counter to which the
                 same total number of increment operations have been
                 applied. Addition of approximate counters is useful in
                 massively parallel divide-and-conquer algorithms that
                 use a distributed representation for large arrays of
                 counters. We describe two machine-learning algorithms
                 for topic modeling that use millions of integer
                 counters and confirm that replacing the integer
                 counters with approximate counters is effective,
                 speeding up a GPU-based implementation by over 65\% and
                 a CPU-based implementation by nearly 50\%, as well as
                 reducing memory requirements, without degrading their
                 statistical effectiveness.",
  acknowledgement = ack-nhfb,
  articleno =    "5",
  fjournal =     "ACM Transactions on Parallel Computing",
  journal-URL =  "http://dl.acm.org/citation.cfm?id=2632163",
}

@Article{Ballard:2017:GEIb,
  author =       "Grey Ballard and Mary Hall and Tim Harris and Brandon
                 Lucia",
  title =        "{Guest Editor} Introduction {PPoPP 2016}, Special
                 Issue 2 of 2",
  journal =      j-TOPC,
  volume =       "4",
  number =       "2",
  pages =        "6:1--6:??",
  month =        oct,
  year =         "2017",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3108142",
  ISSN =         "2329-4949 (print), 2329-4957 (electronic)",
  ISSN-L =       "2329-4949",
  bibdate =      "Tue Oct 10 17:42:07 MDT 2017",
  bibsource =    "http://topc.acm.org/;
                 http://www.math.utah.edu/pub/tex/bib/topc.bib",
  acknowledgement = ack-nhfb,
  articleno =    "6",
  fjournal =     "ACM Transactions on Parallel Computing",
  journal-URL =  "http://dl.acm.org/citation.cfm?id=2632163",
}

@Article{Kalikar:2017:DNM,
  author =       "Saurabh Kalikar and Rupesh Nasre",
  title =        "{DomLock}: a New Multi-Granularity Locking Technique
                 for Hierarchies",
  journal =      j-TOPC,
  volume =       "4",
  number =       "2",
  pages =        "7:1--7:??",
  month =        oct,
  year =         "2017",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3127584",
  ISSN =         "2329-4949 (print), 2329-4957 (electronic)",
  ISSN-L =       "2329-4949",
  bibdate =      "Tue Oct 10 17:42:07 MDT 2017",
  bibsource =    "http://topc.acm.org/;
                 http://www.math.utah.edu/pub/tex/bib/topc.bib",
  abstract =     "We present efficient locking mechanisms for
                 hierarchical data structures. Several applications work
                 on an abstract hierarchy of objects, and a parallel
                 execution on this hierarchy necessitates
                 synchronization across workers operating on different
                 parts of the hierarchy. Existing synchronization
                 mechanisms are too coarse, too inefficient, or too ad
                 hoc, resulting in reduced or unpredictable amount of
                 concurrency. We propose a new locking approach based on
                 the structural properties of the underlying hierarchy.
                 We show that the developed techniques are efficient
                 even when the hierarchy is an arbitrary graph.
                 Theoretically, we present our approach as a
                 locking-cost-minimizing instance of a generic algebraic
                 model of synchronization for hierarchies. Using
                 STMBench7, we illustrate considerable reduction in the
                 locking cost, resulting in an average throughput
                 improvement of 42\%.",
  acknowledgement = ack-nhfb,
  articleno =    "7",
  fjournal =     "ACM Transactions on Parallel Computing",
  journal-URL =  "http://dl.acm.org/citation.cfm?id=2632163",
}

%%% TOPC 4(2), article 8, October 2017.  The braced title fragment
%%% "{Lease\slash Release}" protects capitalization; \slash typesets a
%%% "/" while still permitting a line break after it.
@Article{Haider:2017:LRA,
  author =       "Syed Kamran Haider and William Hasenplaugh and Dan
                 Alistarh",
  title =        "{Lease\slash Release}: Architectural Support for
                 Scaling Contended Data Structures",
  journal =      j-TOPC,
  volume =       "4",
  number =       "2",
  pages =        "8:1--8:??",
  month =        oct,
  year =         "2017",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3132168",
  ISSN =         "2329-4949 (print), 2329-4957 (electronic)",
  ISSN-L =       "2329-4949",
  bibdate =      "Tue Oct 10 17:42:07 MDT 2017",
  bibsource =    "http://topc.acm.org/;
                 http://www.math.utah.edu/pub/tex/bib/topc.bib",
  abstract =     "High memory contention is generally agreed to be a
                 worst-case scenario for concurrent data structures.
                 There has been a significant amount of research effort
                 spent investigating designs that minimize contention,
                 and several programming techniques have been proposed
                 to mitigate its effects. However, there are currently
                 few architectural mechanisms to allow scaling contended
                 data structures at high thread counts. In this article,
                 we investigate hardware support for scalable contended
                 data structures. We propose Lease/Release, a simple
                 addition to standard directory-based MESI cache
                 coherence protocols, allowing participants to lease
                 memory, at the granularity of cache lines, by delaying
                 coherence messages for a short, bounded period of time.
                 Our analysis shows that Lease/Release can significantly
                 reduce the overheads of contention for both
                 non-blocking (lock-free) and lock-based data structure
                 implementations while ensuring that no deadlocks are
                 introduced. We validate Lease/Release empirically on
                 the Graphite multiprocessor simulator on a range of
                 data structures, including queue, stack, and priority
                 queue implementations, as well as on transactional
                 applications. Results show that Lease/Release
                 consistently improves both throughput and energy usage,
                 by up to 5x, both for lock-free and lock-based data
                 structure designs.",
  acknowledgement = ack-nhfb,
  articleno =    "8",
  fjournal =     "ACM Transactions on Parallel Computing",
  journal-URL =  "http://dl.acm.org/citation.cfm?id=2632163",
}

%%% TOPC 4(2), article 9, October 2017.
%%% NOTE(review): restored TeX em dashes (---) in the abstract where the
%%% publisher's em dashes were flattened to bare hyphens during text
%%% extraction ("parallelism---such", "models---helps", "way---meaning",
%%% "accurately---while"); without this the entry typesets bogus
%%% compound words such as "parallelism-such".
@Article{Cao:2017:HRD,
  author =       "Man Cao and Minjia Zhang and Aritra Sengupta and
                 Swarnendu Biswas and Michael D. Bond",
  title =        "Hybridizing and Relaxing Dependence Tracking for
                 Efficient Parallel Runtime Support",
  journal =      j-TOPC,
  volume =       "4",
  number =       "2",
  pages =        "9:1--9:??",
  month =        oct,
  year =         "2017",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3108138",
  ISSN =         "2329-4949 (print), 2329-4957 (electronic)",
  ISSN-L =       "2329-4949",
  bibdate =      "Tue Oct 10 17:42:07 MDT 2017",
  bibsource =    "http://topc.acm.org/;
                 http://www.math.utah.edu/pub/tex/bib/topc.bib",
  abstract =     "It is notoriously challenging to develop parallel
                 software systems that are both scalable and correct.
                 Runtime support for parallelism---such as multithreaded
                 record and replay, data race detectors, transactional
                 memory, and enforcement of stronger memory models---helps
                 achieve these goals, but existing commodity solutions
                 slow programs substantially to track (i.e., detect or
                 control) an execution's cross-thread dependencies
                 accurately. Prior work tracks cross-thread dependencies
                 either ``pessimistically,'' slowing every program
                 access, or ``optimistically,'' allowing for lightweight
                 instrumentation of most accesses but dramatically
                 slowing accesses that are conflicting (i.e., involved
                 in cross-thread dependencies). This article presents
                 two novel approaches that seek to improve the
                 performance of dependence tracking. Hybrid tracking
                 (HT) hybridizes pessimistic and optimistic tracking by
                 overcoming a fundamental mismatch between these two
                 kinds of tracking. HT uses an adaptive, profile-based
                 policy to make runtime decisions about switching
                 between pessimistic and optimistic tracking. Relaxed
                 tracking (RT) attempts to reduce optimistic tracking's
                 overhead on conflicting accesses by tracking
                 dependencies in a ``relaxed'' way---meaning that not all
                 dependencies are tracked accurately---while still
                 preserving both program semantics and runtime support's
                 correctness. To demonstrate the usefulness and
                 potential of HT and RT, we build runtime support based
                 on the two approaches. Our evaluation shows that both
                 approaches offer performance advantages over existing
                 approaches, but there exist challenges and
                 opportunities for further improvement. HT and RT are
                 distinct solutions to the same problem. It is easier to
                 build runtime support based on HT than on RT, although
                 RT does not incur the overhead of online profiling.
                 This article presents the two approaches together to
                 inform and inspire future designs for efficient
                 parallel runtime support.",
  acknowledgement = ack-nhfb,
  articleno =    "9",
  fjournal =     "ACM Transactions on Parallel Computing",
  journal-URL =  "http://dl.acm.org/citation.cfm?id=2632163",
}

%%% TOPC 4(2), article 10, October 2017.  The mixed-case "{ScalabiliTy}"
%%% in the title is brace-protected, so it is apparently intentional
%%% stylization from the published title --- confirm against the DOI
%%% landing page before normalizing.
%%% NOTE(review): the lowercase "estima" throughout the abstract looks
%%% like a small-caps tool name flattened by text extraction; verify
%%% against the publisher's abstract before changing it.
@Article{Chatzopoulos:2017:EES,
  author =       "Georgios Chatzopoulos and Aleksandar Dragojevi{\'c}
                 and Rachid Guerraoui",
  title =        "{ESTIMA}: Extrapolating {ScalabiliTy} of In-Memory
                 Applications",
  journal =      j-TOPC,
  volume =       "4",
  number =       "2",
  pages =        "10:1--10:??",
  month =        oct,
  year =         "2017",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3108137",
  ISSN =         "2329-4949 (print), 2329-4957 (electronic)",
  ISSN-L =       "2329-4949",
  bibdate =      "Tue Oct 10 17:42:07 MDT 2017",
  bibsource =    "http://topc.acm.org/;
                 http://www.math.utah.edu/pub/tex/bib/topc.bib",
  abstract =     "This article presents estima, an easy-to-use tool for
                 extrapolating the scalability of in-memory
                 applications. estima is designed to perform a simple
                 yet important task: Given the performance of an
                 application on a small machine with a handful of cores,
                 estima extrapolates its scalability to a larger machine
                 with more cores, while requiring minimum input from the
                 user. The key idea underlying estima is the use of
                 stalled cycles (e.g., cycles that the processor spends
                 waiting for missed cache line fetches or busy locks).
                 estima measures stalled cycles on a few cores and
                 extrapolates them to more cores, estimating the amount
                 of waiting in the system. estima can be effectively
                 used to predict the scalability of in-memory
                 applications for bigger execution machines. For
                 instance, using measurements of memcached and SQLite on
                 a desktop machine, we obtain accurate predictions of
                 their scalability on a server. Our extensive evaluation
                 shows the effectiveness of estima on a large number of
                 in-memory benchmarks.",
  acknowledgement = ack-nhfb,
  articleno =    "10",
  fjournal =     "ACM Transactions on Parallel Computing",
  journal-URL =  "http://dl.acm.org/citation.cfm?id=2632163",
}

%%% TOPC 4(2), article 11, October 2017.  Five authors, separated by
%%% " and " per BibTeX name-list convention.
@Article{Gulisano:2017:EDS,
  author =       "Vincenzo Gulisano and Yiannis Nikolakopoulos and
                 Daniel Cederman and Marina Papatriantafilou and
                 Philippas Tsigas",
  title =        "Efficient Data Streaming Multiway Aggregation through
                 Concurrent Algorithmic Designs and New Abstract Data
                 Types",
  journal =      j-TOPC,
  volume =       "4",
  number =       "2",
  pages =        "11:1--11:??",
  month =        oct,
  year =         "2017",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3131272",
  ISSN =         "2329-4949 (print), 2329-4957 (electronic)",
  ISSN-L =       "2329-4949",
  bibdate =      "Tue Oct 10 17:42:07 MDT 2017",
  bibsource =    "http://topc.acm.org/;
                 http://www.math.utah.edu/pub/tex/bib/topc.bib",
  abstract =     "Data streaming relies on continuous queries to process
                 unbounded streams of data in a real-time fashion. It is
                 commonly demanding in computation capacity, given that
                 the relevant applications involve very large volumes of
                 data. Data structures act as articulation points and
                 maintain the state of data streaming operators,
                 potentially supporting high parallelism and balancing
                 the work among them. Prompted by this fact, in this
                 work we study and analyze parallelization needs of
                 these articulation points, focusing on the problem of
                 streaming multiway aggregation, where large data
                 volumes are received from multiple input streams. The
                 analysis of the parallelization needs, as well as of
                 the use and limitations of existing aggregate designs
                 and their data structures, leads us to identify needs
                 for appropriate shared objects that can achieve
                 low-latency and high-throughput multiway aggregation.
                 We present the requirements of such objects as abstract
                 data types and we provide efficient lock-free
                 linearizable algorithmic implementations of them, along
                 with new multiway aggregate algorithmic designs that
                 leverage them, supporting both deterministic
                 order-sensitive and order-insensitive aggregate
                 functions. Furthermore, we point out future directions
                 that open through these contributions. The article
                 includes an extensive experimental study, based on a
                 variety of continuous aggregation queries on two large
                 datasets extracted from SoundCloud, a music social
                 network, and from a Smart Grid network. In all the
                 experiments, the proposed data structures and the
                 enhanced aggregate operators improved the processing
                 performance significantly, up to one order of
                 magnitude, in terms of both throughput and latency,
                 over the commonly used techniques based on queues.",
  acknowledgement = ack-nhfb,
  articleno =    "11",
  fjournal =     "ACM Transactions on Parallel Computing",
  journal-URL =  "http://dl.acm.org/citation.cfm?id=2632163",
}

%%% TOPC 4(3), article 12, April 2018.  First entry of issue 3; note the
%%% bibsource here omits topc.acm.org (unlike the 4(2) entries) and the
%%% bibdate is Jan 2019, reflecting a later harvesting pass.
@Article{Malas:2018:MIP,
  author =       "Tareq M. Malas and Georg Hager and Hatem Ltaief and
                 David E. Keyes",
  title =        "Multidimensional Intratile Parallelization for
                 Memory-Starved Stencil Computations",
  journal =      j-TOPC,
  volume =       "4",
  number =       "3",
  pages =        "12:1--12:??",
  month =        apr,
  year =         "2018",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3155290",
  ISSN =         "2329-4949 (print), 2329-4957 (electronic)",
  ISSN-L =       "2329-4949",
  bibdate =      "Wed Jan 23 16:12:25 MST 2019",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/topc.bib",
  abstract =     "Optimizing the performance of stencil algorithms has
                 been the subject of intense research over the last two
                 decades. Since many stencil schemes have low arithmetic
                 intensity, most optimizations focus on increasing the
                 temporal data access locality, thus reducing the data
                 traffic through the main memory interface with the
                 ultimate goal of decoupling from this bottleneck. There
                 are, however, only a few approaches that explicitly
                 leverage the shared cache feature of modern multicore
                 chips. If every thread works on its private, separate
                 cache block, the available cache space can become too
                 small, and sufficient temporal locality may not be
                 achieved. We propose a flexible multidimensional
                 intratile parallelization method for stencil algorithms
                 on multicore CPUs with a shared outer-level cache. This
                 method leads to a significant reduction in the required
                 cache space without adverse effects from hardware
                 prefetching or TLB shortage. Our Girih framework
                 includes an autotuner to select optimal parameter
                 configurations on the target hardware. We conduct
                 performance experiments on two contemporary Intel
                 processors and compare with the state-of-the-art
                 stencil frameworks Pluto and Pochoir, using four
                 corner-case stencil schemes and a wide range of problem
                 sizes. Girih shows substantial performance advantages
                 and best arithmetic intensity at almost all problem
                 sizes, especially on low-intensity stencils with
                 variable coefficients. We study in detail the
                 performance behavior at varying grid sizes using
                 phenomenological performance modeling. Our analysis of
                 energy consumption reveals that our method can save
                 energy through reduced DRAM bandwidth usage even at a
                 marginal performance gain. It is thus well suited for
                 future architectures that will be strongly challenged
                 by the cost of data movement, be it in terms of
                 performance or energy consumption.",
  acknowledgement = ack-nhfb,
  articleno =    "12",
  fjournal =     "ACM Transactions on Parallel Computing",
  journal-URL =  "http://dl.acm.org/citation.cfm?id=2632163",
}

%%% TOPC 4(3), article 13, April 2018.  The title's "Matrix--Matrix"
%%% uses a TeX en dash; the abstract's "outer-product--parallel" etc.
%%% likewise combine hyphens and en dashes deliberately.
@Article{Akbudak:2018:PMS,
  author =       "Kadir Akbudak and Oguz Selvitopi and Cevdet Aykanat",
  title =        "Partitioning Models for Scaling Parallel Sparse
                 Matrix--Matrix Multiplication",
  journal =      j-TOPC,
  volume =       "4",
  number =       "3",
  pages =        "13:1--13:??",
  month =        apr,
  year =         "2018",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3155292",
  ISSN =         "2329-4949 (print), 2329-4957 (electronic)",
  ISSN-L =       "2329-4949",
  bibdate =      "Wed Jan 23 16:12:25 MST 2019",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/topc.bib",
  abstract =     "We investigate outer-product--parallel,
                 inner-product--parallel, and
                 row-by-row-product--parallel formulations of sparse
                 matrix-matrix multiplication (SpGEMM) on distributed
                 memory architectures. For each of these three
                 formulations, we propose a hypergraph model and a
                 bipartite graph model for distributing SpGEMM
                 computations based on one-dimensional (1D) partitioning
                 of input matrices. We also propose a communication
                 hypergraph model for each formulation for distributing
                 communication operations. The computational graph and
                 hypergraph models adopted in the first phase aim at
                 minimizing the total message volume and balancing the
                 computational loads of processors, whereas the
                 communication hypergraph models adopted in the second
                 phase aim at minimizing the total message count and
                 balancing the message volume loads of processors. That
                 is, the computational partitioning models reduce the
                 bandwidth cost and the communication hypergraph models
                 reduce the latency cost. Our extensive parallel
                 experiments on up to 2048 processors for a wide range
                 of realistic SpGEMM instances show that although the
                 outer-product--parallel formulation scales better, the
                 row-by-row-product--parallel formulation is more viable
                 due to its significantly lower partitioning overhead
                 and competitive scalability. For computational
                 partitioning models, our experimental findings indicate
                 that the proposed bipartite graph models are attractive
                 alternatives to their hypergraph counterparts because
                 of their lower partitioning overhead. Finally, we show
                 that by reducing the latency cost besides the bandwidth
                 cost through using the communication hypergraph models,
                 the parallel SpGEMM time can be further improved up to
                 32\%.",
  acknowledgement = ack-nhfb,
  articleno =    "13",
  fjournal =     "ACM Transactions on Parallel Computing",
  journal-URL =  "http://dl.acm.org/citation.cfm?id=2632163",
}

%%% TOPC 4(3), article 14, April 2018.  "{BSP}" is brace-protected in
%%% the title so the acronym survives style-driven downcasing.
@Article{Bilardi:2018:LBT,
  author =       "Gianfranco Bilardi and Michele Scquizzato and
                 Francesco Silvestri",
  title =        "A Lower Bound Technique for Communication in {BSP}",
  journal =      j-TOPC,
  volume =       "4",
  number =       "3",
  pages =        "14:1--14:??",
  month =        apr,
  year =         "2018",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3181776",
  ISSN =         "2329-4949 (print), 2329-4957 (electronic)",
  ISSN-L =       "2329-4949",
  bibdate =      "Wed Jan 23 16:12:25 MST 2019",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/topc.bib",
  abstract =     "Communication is a major factor determining the
                 performance of algorithms on current computing systems;
                 it is therefore valuable to provide tight lower bounds
                 on the communication complexity of computations. This
                 article presents a lower bound technique for the
                 communication complexity in the bulk-synchronous
                 parallel (BSP) model of a given class of DAG
                 computations. The derived bound is expressed in terms
                 of the switching potential of a DAG, that is, the
                 number of permutations that the DAG can realize when
                 viewed as a switching network. The proposed technique
                 yields tight lower bounds for the fast Fourier
                 transform (FFT), and for any sorting and permutation
                 network. A stronger bound is also derived for the
                 periodic balanced sorting network, by applying this
                 technique to suitable subnetworks. Finally, we
                 demonstrate that the switching potential captures
                 communication requirements even in computational models
                 different from BSP, such as the I/O model and the
                 LPRAM.",
  acknowledgement = ack-nhfb,
  articleno =    "14",
  fjournal =     "ACM Transactions on Parallel Computing",
  journal-URL =  "http://dl.acm.org/citation.cfm?id=2632163",
}

%%% TOPC 4(3), article 15, April 2018.  This entry is cross-listed in
%%% Beebe's multithreading.bib (see bibsource).
@Article{Sahin:2018:CSC,
  author =       "Semih Sahin and Bugra Gedik",
  title =        "{C-Stream}: a Co-routine-Based Elastic Stream
                 Processing Engine",
  journal =      j-TOPC,
  volume =       "4",
  number =       "3",
  pages =        "15:1--15:??",
  month =        apr,
  year =         "2018",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3184120",
  ISSN =         "2329-4949 (print), 2329-4957 (electronic)",
  ISSN-L =       "2329-4949",
  bibdate =      "Wed Jan 23 16:12:25 MST 2019",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/multithreading.bib;
                 http://www.math.utah.edu/pub/tex/bib/topc.bib",
  abstract =     "Stream processing is a computational paradigm for
                 on-the-fly processing of live data. This paradigm lends
                 itself to implementations that can provide high
                 throughput and low latency by taking advantage of
                 various forms of parallelism that are naturally
                 captured by the stream processing model of computation,
                 such as pipeline, task, and data parallelism. In this
                 article, we describe the design and implementation of
                 C-Stream, which is an elastic stream processing engine.
                 C-Stream encompasses three unique properties. First, in
                 contrast to the widely adopted event-based interface
                 for developing streaming operators, C-Stream provides
                 an interface wherein each operator has its own driver
                 loop and relies on data availability application
                 programming interfaces (APIs) to decide when to perform
                 its computations. This self-control-based model
                 significantly simplifies the development of operators
                 that require multiport synchronization. Second,
                 C-Stream contains a dynamic scheduler that manages the
                 multithreaded execution of the operators. The
                 scheduler, which is customizable via plug-ins, enables
                 the execution of the operators as co-routines, using
                 any number of threads. The base scheduler implements
                 back-pressure, provides data availability APIs, and
                 manages preemption and termination handling. Last,
                 C-Stream varies the degree of parallelism to resolve
                 bottlenecks by both dynamically changing the number of
                 threads used to execute an application and adjusting
                 the number of replicas of data-parallel operators. We
                 provide an experimental evaluation of C-Stream. The
                 results show that C-Stream is scalable, highly
                 customizable, and can resolve bottlenecks by
                 dynamically adjusting the level of data parallelism
                 used.",
  acknowledgement = ack-nhfb,
  articleno =    "15",
  fjournal =     "ACM Transactions on Parallel Computing",
  journal-URL =  "http://dl.acm.org/citation.cfm?id=2632163",
}

%%% TOPC 4(4), article 16, September 2018.  Editorial introduction to
%%% the SPAA'15 special issue; no abstract field is present in the
%%% harvested data.
@Article{Agrawal:2018:ISI,
  author =       "Kunal Agrawal and I-Ting Angelina Lee and Michael
                 Spear",
  title =        "Introduction to Special Issue on {SPAA'15}",
  journal =      j-TOPC,
  volume =       "4",
  number =       "4",
  pages =        "16:1--16:??",
  month =        sep,
  year =         "2018",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3226041",
  ISSN =         "2329-4949 (print), 2329-4957 (electronic)",
  ISSN-L =       "2329-4949",
  bibdate =      "Wed Jan 23 16:12:25 MST 2019",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/topc.bib",
  acknowledgement = ack-nhfb,
  articleno =    "16",
  fjournal =     "ACM Transactions on Parallel Computing",
  journal-URL =  "http://dl.acm.org/citation.cfm?id=2632163",
}

%%% TOPC 4(4), article 17, September 2018.  The abstract embeds TeX
%%% inline math ($m$, $ (1 - \epsilon)$, $ O(n^{1 + 1 / p})$); keep the
%%% dollar-sign pairing intact when editing this field.
@Article{Ahn:2018:ADN,
  author =       "Kook Jin Ahn and Sudipto Guha",
  title =        "Access to Data and Number of Iterations: Dual Primal
                 Algorithms for Maximum Matching under Resource
                 Constraints",
  journal =      j-TOPC,
  volume =       "4",
  number =       "4",
  pages =        "17:1--17:??",
  month =        sep,
  year =         "2018",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3154855",
  ISSN =         "2329-4949 (print), 2329-4957 (electronic)",
  ISSN-L =       "2329-4949",
  bibdate =      "Wed Jan 23 16:12:25 MST 2019",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/topc.bib",
  abstract =     "In this article, we consider graph algorithms in
                 models of computation where the space usage (random
                 accessible storage, in addition to the read-only input)
                 is sublinear in the number of edges $m$ and the access
                 to input is constrained. These questions arise in many
                 natural settings, and in particular in the analysis of
                 streaming algorithms, MapReduce or similar algorithms,
                 or message passing distributed computing that model
                 constrained parallelism with sublinear central
                 processing. We focus on weighted nonbipartite maximum
                 matching in this article. For any constant $ p > 1$, we
                 provide an iterative sampling-based algorithm for
                 computing a $ (1 - \epsilon)$-approximation of the
                 weighted nonbipartite maximum matching that uses $ O(p
                 / \epsilon)$ rounds of sampling, and $ O(n^{1 + 1 /
                 p})$ space. The results extend to $b$-Matching with
                 small changes. This article combines adaptive sketching
                 literature and fast primal-dual algorithms based on
                 relaxed Dantzig--Wolfe decision procedures. Each round
                 of sampling is implemented through linear sketches and
                 can be executed in a single round of streaming or two
                 rounds of MapReduce. The article also proves that
                 nonstandard linear relaxations of a problem, in
                 particular penalty-based formulations, are helpful in
                 reducing the adaptive dependence of the iterations.",
  acknowledgement = ack-nhfb,
  articleno =    "17",
  fjournal =     "ACM Transactions on Parallel Computing",
  journal-URL =  "http://dl.acm.org/citation.cfm?id=2632163",
}

%%% TOPC 4(4), article 18, September 2018.
@Article{Alistarh:2018:TAS,
  author =       "Dan Alistarh and William Leiserson and Alexander
                 Matveev and Nir Shavit",
  title =        "{ThreadScan}: Automatic and Scalable Memory
                 Reclamation",
  journal =      j-TOPC,
  volume =       "4",
  number =       "4",
  pages =        "18:1--18:??",
  month =        sep,
  year =         "2018",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3201897",
  ISSN =         "2329-4949 (print), 2329-4957 (electronic)",
  ISSN-L =       "2329-4949",
  bibdate =      "Wed Jan 23 16:12:25 MST 2019",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/topc.bib",
  abstract =     "The concurrent memory reclamation problem is that of
                 devising a way for a deallocating thread to verify that
                 no other concurrent threads hold references to a memory
                 block being deallocated. To date, in the absence of
                 automatic garbage collection, there is no satisfactory
                 solution to this problem; existing tracking methods
                 like hazard pointers, reference counters, or
                 epoch-based techniques like RCU are either
                 prohibitively expensive or require significant
                 programming expertise to the extent that implementing
                 them efficiently can be worthy of a publication. None
                 of the existing techniques are automatic or even
                 semi-automated. In this article, we take a new approach
                 to concurrent memory reclamation. Instead of manually
                 tracking access to memory locations as done in
                 techniques like hazard pointers, or restricting shared
                 accesses to specific epoch boundaries as in RCU, our
                 algorithm, called ThreadScan, leverages operating
                 system signaling to automatically detect which memory
                 locations are being accessed by concurrent threads.
                 Initial empirical evidence shows that ThreadScan scales
                 surprisingly well and requires negligible programming
                 effort beyond the standard use of Malloc and Free.",
  acknowledgement = ack-nhfb,
  articleno =    "18",
  fjournal =     "ACM Transactions on Parallel Computing",
  journal-URL =  "http://dl.acm.org/citation.cfm?id=2632163",
}

%%% TOPC 4(4), article 19, September 2018.  The abstract embeds TeX
%%% math ($ \Theta (n) $, $ \Theta (1) $); keep dollar pairing intact.
%%% NOTE(review): "Toward that (a) we extend" reads as if punctuation
%%% after "that" was lost in extraction --- confirm against the
%%% published abstract before changing.
@Article{Dimitrov:2018:RDT,
  author =       "Dimitar Dimitrov and Martin Vechev and Vivek Sarkar",
  title =        "Race Detection in Two Dimensions",
  journal =      j-TOPC,
  volume =       "4",
  number =       "4",
  pages =        "19:1--19:??",
  month =        sep,
  year =         "2018",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3264618",
  ISSN =         "2329-4949 (print), 2329-4957 (electronic)",
  ISSN-L =       "2329-4949",
  bibdate =      "Wed Jan 23 16:12:25 MST 2019",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/topc.bib",
  abstract =     "Dynamic race detection is a program analysis technique
                 for detecting errors caused by undesired interleavings
                 of concurrent tasks. A primary challenge when designing
                 efficient race detection algorithms is to achieve
                 manageable space requirements. State-of-the-art
                 algorithms for unstructured parallelism require $
                 \Theta (n) $ space per monitored memory location, where
                 n is the total number of tasks. This is a serious
                 drawback when analyzing programs with many tasks. In
                 contrast, algorithms for programs with a
                 series-parallel (SP) structure require only $ \Theta
                 (1) $ space. Unfortunately, it is currently not well
                 understood if there are classes of parallelism beyond
                 SP that can also benefit from and be analyzed with $
                 \Theta (1) $ space complexity. In this work, we show
                 that structures richer than SP graphs, namely, that of
                 two-dimensional (2D) lattices, can also be analyzed in
                 $ \Theta (1) $ space. Toward that (a) we extend
                 Tarjan's algorithm for finding lowest common ancestors
                 to handle 2D lattices; (b) from that extension we
                 derive a serial algorithm for race detection that can
                 analyze arbitrary task graphs with a 2D lattice
                 structure; (c) we present a restriction to fork-join
                 that admits precisely the 2D lattices as task graphs
                 (e.g., it can express pipeline parallelism). Our work
                 generalizes prior work on structured race detection and
                 aims to provide a deeper understanding of the interplay
                 between structured parallelism and program analysis
                 efficiency.",
  acknowledgement = ack-nhfb,
  articleno =    "19",
  fjournal =     "ACM Transactions on Parallel Computing",
  journal-URL =  "http://dl.acm.org/citation.cfm?id=2632163",
}

@Article{Lee:2018:ERD,
  author =       "I-Ting Angelina Lee and Tao B. Schardl",
  title =        "Efficient Race Detection for Reducer Hyperobjects",
  journal =      j-TOPC,
  volume =       "4",
  number =       "4",
  pages =        "20:1--20:??",
  month =        sep,
  year =         "2018",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3205914",
  ISSN =         "2329-4949 (print), 2329-4957 (electronic)",
  ISSN-L =       "2329-4949",
  bibdate =      "Wed Jan 23 16:12:25 MST 2019",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/multithreading.bib;
                 http://www.math.utah.edu/pub/tex/bib/topc.bib",
  abstract =     "A multithreaded Cilk program that is ostensibly
                 deterministic may nevertheless behave
                 nondeterministically due to programming errors in the
                  code. For a Cilk program that uses reducers---a general
                  reduction mechanism supported in various Cilk
                  dialects---such programming errors are especially
                 challenging to debug, because the errors can expose the
                 nondeterminism in how the Cilk runtime system manages
                 reducers. We identify two unique types of races that
                 arise from incorrect use of reducers in a Cilk program,
                 and we present two algorithms to catch these races. The
                 first algorithm, called the Peer-Set algorithm, detects
                 view-read races, which occur when the program attempts
                 to retrieve a value out of a reducer when the read may
                 result in a nondeterministic value, such as before all
                 previously spawned subcomputations that might update
                 the reducer have necessarily returned. The second
                 algorithm, called the SP+ algorithm, detects
                  determinacy races---instances where a write to a memory
                  location occurs logically in parallel with another
                  access to that location---even when the raced-on memory
                 locations relate to reducers. Both algorithms are
                 provably correct, asymptotically efficient, and can be
                 implemented efficiently in practice. We have
                 implemented both algorithms in our prototype race
                 detector, Rader. When running Peer-Set, Rader incurs a
                 geometric-mean multiplicative overhead of 2.56 over
                 running the benchmark without instrumentation. When
                 running SP+, Rader incurs a geometric-mean
                 multiplicative overhead of 16.94.",
  acknowledgement = ack-nhfb,
  articleno =    "20",
  fjournal =     "ACM Transactions on Parallel Computing",
  journal-URL =  "http://dl.acm.org/citation.cfm?id=2632163",
}

@Article{Gilbert:2018:ISI,
  author =       "Seth Gilbert",
  title =        "Introduction to the Special Issue for {SPAA 2016}",
  journal =      j-TOPC,
  volume =       "5",
  number =       "1",
  pages =        "1:1--1:??",
  month =        sep,
  year =         "2018",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3230677",
  ISSN =         "2329-4949 (print), 2329-4957 (electronic)",
  ISSN-L =       "2329-4949",
  bibdate =      "Wed Jan 23 16:12:26 MST 2019",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/topc.bib",
  acknowledgement = ack-nhfb,
  articleno =    "1",
  fjournal =     "ACM Transactions on Parallel Computing",
  journal-URL =  "http://dl.acm.org/citation.cfm?id=2632163",
}

@Article{Mitzenmacher:2018:BBC,
  author =       "Michael Mitzenmacher and Rajmohan Rajaraman and Scott
                 Roche",
  title =        "Better Bounds for Coalescing-Branching Random Walks",
  journal =      j-TOPC,
  volume =       "5",
  number =       "1",
  pages =        "2:1--2:??",
  month =        sep,
  year =         "2018",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3209688",
  ISSN =         "2329-4949 (print), 2329-4957 (electronic)",
  ISSN-L =       "2329-4949",
  bibdate =      "Wed Jan 23 16:12:26 MST 2019",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/topc.bib",
  abstract =     "Coalescing-branching random walks, or cobra walks for
                 short, are a natural variant of random walks on graphs
                 that can model the spread of disease through contacts
                  or the spread of information in networks. In a k-cobra
                 walk, at each timestep, a subset of the vertices are
                 active; each active vertex chooses k random neighbors
                 (sampled independently and uniformly with replacement)
                 that become active at the next step, and these are the
                 only active vertices at the next step. A natural
                 quantity to study for cobra walks is the cover time,
                 which corresponds to the expected time when all nodes
                 have become infected or received the disseminated
                 information. In this article, we extend previous
                 results for cobra walks in multiple ways. We show that
                 the cover time for the 2-cobra walk on $ [0, n]^d $ is
                 $ O(n) $ (where the order notation hides constant
                 factors that depend on $d$); previous work had shown
                 the cover time was $ O(n \cdot \polylog (n))$. We show
                  that the cover time for a 2-cobra walk on an $n$-vertex
                  $d$-regular graph with conductance $ \phi_G$ is $ O(d^4
                  \phi^{-2}_G \log^2 n)$, significantly generalizing a
                 previous result that held only for expander graphs with
                 sufficiently high expansion. And, finally, we show that
                 the cover time for a 2-cobra walk on a graph with n
                 vertices and m edges is always $ O(m n^{3 / 4} \log
                 n)$; this is the first result showing that the bound of
                 $ \Theta (n^3)$ for the worst-case cover time for
                 random walks can be beaten using 2-cobra walks.",
  acknowledgement = ack-nhfb,
  articleno =    "2",
  fjournal =     "ACM Transactions on Parallel Computing",
  journal-URL =  "http://dl.acm.org/citation.cfm?id=2632163",
}

@Article{Liu:2018:RAN,
  author =       "Mingmou Liu and Xiaoyin Pan and Yitong Yin",
  title =        "Randomized Approximate Nearest Neighbor Search with
                 Limited Adaptivity",
  journal =      j-TOPC,
  volume =       "5",
  number =       "1",
  pages =        "3:1--3:??",
  month =        sep,
  year =         "2018",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3209884",
  ISSN =         "2329-4949 (print), 2329-4957 (electronic)",
  ISSN-L =       "2329-4949",
  bibdate =      "Wed Jan 23 16:12:26 MST 2019",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/topc.bib",
  abstract =     "We study the complexity of parallel data structures
                 for approximate nearest neighbor search in
                 $d$-dimensional Hamming space $ \{ 0, 1 \}^d$. A
                 classic model for static data structures is the
                 cell-probe model [27]. We consider a cell-probe model
                 with limited adaptivity, where given a $ k \geq 1$, a
                 query is resolved by making at most $k$ rounds of
                 parallel memory accesses to the data structure. We give
                 two randomized algorithms that solve the approximate
                 nearest neighbor search using $k$ rounds of parallel
                 memory accesses: --- a simple algorithm with $ O(k
                 (\log d)^{1 / k})$ total number of memory accesses for
                 all $ k \geq 1$ --- an algorithm with $ O(k + (1 / k
                 \log d)^{O(1 / k)})$ total number of memory accesses
                 for all sufficiently large $k$. Both algorithms use
                 data structures of polynomial size. We prove an $
                 \Omega (1 / k (\log d)^{1 / k})$ lower bound for the
                 total number of memory accesses for any randomized
                 algorithm solving the approximate nearest neighbor
                 search within $ k \leq \log \log d / 2 \log \log \log
                 d$ rounds of parallel memory accesses on any data
                 structures of polynomial size. This lower bound shows
                 that our first algorithm is asymptotically optimal when
                 $ k = O(1)$. And our second algorithm achieves the
                 asymptotically optimal tradeoff between number of
                 rounds and total number of memory accesses. In the
                 extremal case, when $ k = O(\log \log d / \log \log
                 \log d)$ is big enough, our second algorithm matches
                 the $ \Theta (\log \log d / \log \log \log d)$ tight
                 bound for fully adaptive algorithms for approximate
                 nearest neighbor search in [11].",
  acknowledgement = ack-nhfb,
  articleno =    "3",
  fjournal =     "ACM Transactions on Parallel Computing",
  journal-URL =  "http://dl.acm.org/citation.cfm?id=2632163",
}

@Article{Pandurangan:2018:FDA,
  author =       "Gopal Pandurangan and Peter Robinson and Michele
                 Scquizzato",
  title =        "Fast Distributed Algorithms for Connectivity and {MST}
                 in Large Graphs",
  journal =      j-TOPC,
  volume =       "5",
  number =       "1",
  pages =        "4:1--4:??",
  month =        sep,
  year =         "2018",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3209689",
  ISSN =         "2329-4949 (print), 2329-4957 (electronic)",
  ISSN-L =       "2329-4949",
  bibdate =      "Wed Jan 23 16:12:26 MST 2019",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/topc.bib",
  abstract =     "Motivated by the increasing need to understand the
                 algorithmic foundations of distributed large-scale
                 graph computations, we study a number of fundamental
                 graph problems in a message-passing model for
                 distributed computing where $ k \geq 2 $ machines
                 jointly perform computations on graphs with $n$ nodes
                 (typically, $ n \gg k$). The input graph is assumed to
                 be initially randomly partitioned among the $k$
                 machines, a common implementation in many real-world
                 systems. Communication is point-to-point, and the goal
                 is to minimize the number of communication rounds of
                 the computation. Our main result is an (almost) optimal
                 distributed randomized algorithm for graph
                 connectivity. Our algorithm runs in $ {\~ O}(n / k^2)$
                 rounds ($ {\~ O}$ notation hides a $ \polylog (n)$
                 factor and an additive $ \polylog (n)$ term). This
                 improves over the best previously known bound of $ {\~
                 O}(n / k)$ [Klauck et al., SODA 2015] and is optimal
                 (up to a polylogarithmic factor) in light of an
                 existing lower bound of $ \Omega^~(n / k^2)$. Our
                 improved algorithm uses a bunch of techniques,
                 including linear graph sketching, that prove useful in
                 the design of efficient distributed graph algorithms.
                 Using the connectivity algorithm as a building block,
                 we then present fast randomized algorithms for
                 computing minimum spanning trees, (approximate)
                 min-cuts, and for many graph verification problems. All
                 these algorithms take $ {\~ O}(n / k^2)$ rounds and are
                 optimal up to polylogarithmic factors. We also show an
                 almost matching lower bound of $ \Omega^~(n / k^2)$
                 rounds for many graph verification problems by
                 leveraging lower bounds in random-partition
                 communication complexity.",
  acknowledgement = ack-nhfb,
  articleno =    "4",
  fjournal =     "ACM Transactions on Parallel Computing",
  journal-URL =  "http://dl.acm.org/citation.cfm?id=2632163",
}

@Article{Korupolu:2018:RPF,
  author =       "Madhukar Korupolu and Rajmohan Rajaraman",
  title =        "Robust and Probabilistic Failure-Aware Placement",
  journal =      j-TOPC,
  volume =       "5",
  number =       "1",
  pages =        "5:1--5:??",
  month =        sep,
  year =         "2018",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3210367",
  ISSN =         "2329-4949 (print), 2329-4957 (electronic)",
  ISSN-L =       "2329-4949",
  bibdate =      "Wed Jan 23 16:12:26 MST 2019",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/topc.bib",
  abstract =     "Motivated by the growing complexity and heterogeneity
                 of modern data centers, and the prevalence of commodity
                 component failures, this article studies the
                 failure-aware placement problem of placing tasks of a
                 parallel job on machines in the data center with the
                 goal of increasing availability. We consider two models
                 of failures: adversarial and probabilistic. In the
                 adversarial model, each node has a weight (higher
                 weight implying higher reliability) and the adversary
                 can remove any subset of nodes of total weight at most
                 a given bound W and our goal is to find a placement
                 that incurs the least disruption against such an
                 adversary. In the probabilistic model, each node has a
                 probability of failure and we need to find a placement
                 that maximizes the probability that at least K out of N
                 tasks survive at any time. For adversarial failures, we
                 first show that (i) the problems are in $ \Sigma_2 $,
                 the second level of the polynomial hierarchy; (ii) a
                 variant of the problem that we call RobustFap (for
                 Robust Failure-Aware Placement) is co-NP-hard; and
                 (iii) an all-or-nothing version of RobustFap is $
                 \Sigma_2$-complete. We then give a polynomial-time
                 approximation scheme (PTAS) for RobustFap, a key
                 ingredient of which is a solution that we design for a
                 fractional version of RobustFap. We then study
                 HierRobustFap, which is the fractional RobustFap
                 problem over a hierarchical network, in which failures
                 can occur at any subset of nodes in the hierarchy, and
                 a failure at a node can adversely impact all of its
                 descendants in the hierarchy. To solve HierRobustFap,
                 we introduce a notion of hierarchical max-min fairness
                 and a novel Generalized Spreading algorithm, which is
                 simultaneously optimal for every upper bound W on the
                 total weight of nodes that an adversary can fail. These
                 generalize the classical notion of max-min fairness to
                 work with nodes of differing capacities, differing
                 reliability weights, and hierarchical structures. Using
                 randomized rounding, we extend this to give an
                 algorithm for integral HierRobustFap. For the
                 probabilistic version, we first give an algorithm that
                 achieves an additive $ \epsilon $ approximation in the
                 failure probability for the single level version,
                 called ProbFap, while giving up a $ (1 + \epsilon)$
                 multiplicative factor in the number of failures. We
                 then extend the result to the hierarchical version,
                  HierProbFap, achieving an $ \epsilon $ additive
                 approximation in failure probability while giving up an
                 $ (L + \epsilon)$ multiplicative factor in the number
                 of failures, where $L$ is the number of levels in the
                 hierarchy.",
  acknowledgement = ack-nhfb,
  articleno =    "5",
  fjournal =     "ACM Transactions on Parallel Computing",
  journal-URL =  "http://dl.acm.org/citation.cfm?id=2632163",
}

@Article{Zhang:2018:LFT,
  author =       "Deli Zhang and Pierre Laborde and Lance Lebanoff and
                 Damian Dechev",
  title =        "Lock-Free Transactional Transformation for Linked Data
                 Structures",
  journal =      j-TOPC,
  volume =       "5",
  number =       "1",
  pages =        "6:1--6:??",
  month =        sep,
  year =         "2018",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3209690",
  ISSN =         "2329-4949 (print), 2329-4957 (electronic)",
  ISSN-L =       "2329-4949",
  bibdate =      "Wed Jan 23 16:12:26 MST 2019",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/hash.bib;
                 http://www.math.utah.edu/pub/tex/bib/topc.bib",
  abstract =     "Nonblocking data structures allow scalable and
                 thread-safe access to shared data. They provide
                 individual operations that appear to execute
                 atomically. However, it is often desirable to execute
                 multiple operations atomically in a transactional
                 manner. Previous solutions, such as Software
                 Transactional Memory (STM) and transactional boosting,
                 manage transaction synchronization separately from the
                 underlying data structure's thread synchronization.
                 Although this reduces programming effort, it leads to
                 overhead associated with additional synchronization and
                 the need to rollback aborted transactions. In this
                 work, we present a new methodology for transforming
                 high-performance lock-free linked data structures into
                 high-performance lock-free transactional linked data
                 structures without revamping the data structures'
                 original synchronization design. Our approach leverages
                 the semantic knowledge of the data structure to
                 eliminate the overhead of false conflicts and
                 rollbacks. We encapsulate all operations, operands, and
                 transaction status in a transaction descriptor, which
                 is shared among the nodes accessed by the same
                 transaction. We coordinate threads to help finish the
                 remaining operations of delayed transactions based on
                 their transaction descriptors. When a transaction
                 fails, we recover the correct abstract state by
                 reversely interpreting the logical status of a node. We
                 also present an obstruction-free version of our
                 algorithm that can be applied to dynamic execution
                 scenarios and an example of our approach applied to a
                 hash map. In our experimental evaluation using
                 transactions with randomly generated operations, our
                 lock-free transactional data structures outperform the
                 transactional boosted ones by 70\% on average. They
                 also outperform the alternative STM-based approaches by
                 a factor of 2 to 13 across all scenarios. More
                 importantly, we achieve 4,700 to 915,000 times fewer
                 spurious aborts than the alternatives.",
  acknowledgement = ack-nhfb,
  articleno =    "6",
  fjournal =     "ACM Transactions on Parallel Computing",
  journal-URL =  "http://dl.acm.org/citation.cfm?id=2632163",
}

@Article{Muller:2018:NHP,
  author =       "Michel M{\"u}ller and Takayuki Aoki",
  title =        "New High Performance {GPGPU} Code Transformation
                 Framework Applied to Large Production Weather
                 Prediction Code",
  journal =      j-TOPC,
  volume =       "5",
  number =       "2",
  pages =        "7:1--7:??",
  month =        jan,
  year =         "2018",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3291523",
  ISSN =         "2329-4949 (print), 2329-4957 (electronic)",
  ISSN-L =       "2329-4949",
  bibdate =      "Wed Jan 23 16:12:26 MST 2019",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/fortran3.bib;
                 http://www.math.utah.edu/pub/tex/bib/topc.bib",
  abstract =     "We introduce ``Hybrid Fortran,'' a new approach that
                 allows a high-performance GPGPU port for structured
                 grid Fortran codes. This technique only requires
                 minimal changes for a CPU targeted codebase, which is a
                 significant advancement in terms of productivity. It
                 has been successfully applied to both dynamical core
                 and physical processes of ASUCA, a Japanese mesoscale
                 weather prediction model with more than 150k lines of
                 code. By means of a minimal weather application that
                 resembles ASUCA's code structure, Hybrid Fortran is
                 compared to both a performance model as well as today's
                 commonly used method, OpenACC. As a result, the Hybrid
                 Fortran implementation is shown to deliver the same or
                 better performance than OpenACC, and its performance
                 agrees with the model both on CPU and GPU. In a
                 full-scale production run, using an ASUCA grid with
                 1581 $ \times $ 1301 $ \times $ 58 cells and real-world
                 weather data in 2km resolution, 24 NVIDIA Tesla P100
                 running the Hybrid Fortran-based GPU port are shown to
                 replace more than fifty 18-core Intel Xeon Broadwell
                  E5-2695 v4 running the reference implementation---an
                 achievement comparable to more invasive GPGPU rewrites
                 of other weather models.",
  acknowledgement = ack-nhfb,
  articleno =    "7",
  fjournal =     "ACM Transactions on Parallel Computing",
  journal-URL =  "http://dl.acm.org/citation.cfm?id=2632163",
}

@Article{Burtscher:2018:HQF,
  author =       "Martin Burtscher and Sindhu Devale and Sahar Azimi and
                 Jayadharini Jaiganesh and Evan Powers",
  title =        "A High-Quality and Fast Maximal Independent Set
                 Implementation for {GPUs}",
  journal =      j-TOPC,
  volume =       "5",
  number =       "2",
  pages =        "8:1--8:??",
  month =        jan,
  year =         "2018",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3291525",
  ISSN =         "2329-4949 (print), 2329-4957 (electronic)",
  ISSN-L =       "2329-4949",
  bibdate =      "Wed Jan 23 16:12:26 MST 2019",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/topc.bib",
  abstract =     "Computing a maximal independent set is an important
                 step in many parallel graph algorithms. This article
                 introduces ECL-MIS, a maximal independent set
                 implementation that works well on GPUs. It includes key
                 optimizations to speed up computation, reduce the
                 memory footprint, and increase the set size. Its CUDA
                 implementation requires fewer than 30 kernel
                 statements, runs asynchronously, and produces a
                 deterministic result. It outperforms the maximal
                 independent set implementations of Pannotia, CUSP, and
                 IrGL on each of the 16 tested graphs of various types
                 and sizes. On a Titan X GPU, ECL-MIS is between 3.9 and
                 100 times faster (11.5 times, on average). ECL-MIS
                 running on the GPU is also faster than the parallel CPU
                 codes Ligra, Ligra+, and PBBS running on 20 Xeon cores,
                 which it outperforms by 4.1 times, on average. At the
                 same time, ECL-MIS produces maximal independent sets
                 that are up to 52\% larger (over 10\%, on average)
                 compared to these preexisting CPU and GPU
                 implementations. Whereas these codes produce maximal
                 independent sets that are, on average, about 15\%
                 smaller than the largest possible such sets, ECL-MIS
                 sets are less than 6\% smaller than the maximum
                 independent sets.",
  acknowledgement = ack-nhfb,
  articleno =    "8",
  fjournal =     "ACM Transactions on Parallel Computing",
  journal-URL =  "http://dl.acm.org/citation.cfm?id=2632163",
}

@Article{Liu:2018:APR,
  author =       "Junhong Liu and Guangming Tan and Yulong Luo and
                 Jiajia Li and Zeyao Mo and Ninghui Sun",
  title =        "An Autotuning Protocol to Rapidly Build Autotuners",
  journal =      j-TOPC,
  volume =       "5",
  number =       "2",
  pages =        "9:1--9:??",
  month =        jan,
  year =         "2018",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3291527",
  ISSN =         "2329-4949 (print), 2329-4957 (electronic)",
  ISSN-L =       "2329-4949",
  bibdate =      "Wed Jan 23 16:12:26 MST 2019",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/topc.bib",
  abstract =     "Automatic performance tuning (Autotuning) is an
                 increasingly critical tuning technique for the high
                 portable performance of Exascale applications. However,
                 constructing an autotuner from scratch remains a
                 challenge, even for domain experts. In this work, we
                 propose a performance tuning and knowledge management
                 suite (PAK) to help rapidly build autotuners. In order
                 to accommodate existing autotuning techniques, we
                 present an autotuning protocol that is composed of an
                 extractor, producer, optimizer, evaluator, and learner.
                 To achieve modularity and reusability, we also define
                 programming interfaces for each protocol component as
                 the fundamental infrastructure, which provides a
                 customizable mechanism to deploy knowledge mining in
                 the performance database. PAK's usability is
                 demonstrated by studying two important computational
                 kernels: stencil computation and sparse matrix-vector
                 multiplication (SpMV). Our proposed autotuner based on
                 PAK shows comparable performance and higher
                 productivity than traditional autotuners by writing
                 just a few tens of code using our autotuning
                 protocol.",
  acknowledgement = ack-nhfb,
  articleno =    "9",
  fjournal =     "ACM Transactions on Parallel Computing",
  journal-URL =  "http://dl.acm.org/citation.cfm?id=2632163",
}

@Article{Anta:2018:SDP,
  author =       "Antonio Fern{\'a}ndez Anta and Dariusz R. Kowalski and
                 Miguel A. Mosteiro and Prudence W. H. Wong",
  title =        "Scheduling Dynamic Parallel Workload of Mobile Devices
                 with Access Guarantees",
  journal =      j-TOPC,
  volume =       "5",
  number =       "2",
  pages =        "10:1--10:??",
  month =        jan,
  year =         "2018",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3291529",
  ISSN =         "2329-4949 (print), 2329-4957 (electronic)",
  ISSN-L =       "2329-4949",
  bibdate =      "Wed Jan 23 16:12:26 MST 2019",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/topc.bib",
  abstract =     "We study a dynamic resource-allocation problem that
                 arises in various parallel computing scenarios, such as
                 mobile cloud computing, cloud computing systems,
                 Internet of Things systems, and others. Generically, we
                 model the architecture as client mobile devices and
                 static base stations. Each client ``arrives'' to the
                 system to upload data to base stations by radio
                 transmissions and then ``leaves.'' The problem, called
                 Station Assignment, is to assign clients to stations so
                 that every client uploads their data under some
                 restrictions, including a target subset of stations, a
                 maximum delay between transmissions, a volume of data
                 to upload, and a maximum bandwidth for each station. We
                 study the solvability of Station Assignment under an
                 adversary that controls the arrival and departure of
                 clients, limited to maximum rate and burstiness of such
                 arrivals. We show upper and lower bounds on the rate
                 and burstiness for various client arrival schedules and
                 protocol classes. To the best of our knowledge, this is
                 the first time that Station Assignment is studied under
                 adversarial arrivals and departures.",
  acknowledgement = ack-nhfb,
  articleno =    "10",
  fjournal =     "ACM Transactions on Parallel Computing",
  journal-URL =  "http://dl.acm.org/citation.cfm?id=2632163",
}

@Article{Mirhosseini:2018:BBA,
  author =       "Amirhossein Mirhosseini and Mohammad Sadrosadati and
                 Fatemeh Aghamohammadi and Mehdi Modarressi and Hamid
                 Sarbazi-Azad",
  title =        "{BARAN}: Bimodal Adaptive Reconfigurable-Allocator
                 Network-on-Chip",
  journal =      j-TOPC,
  volume =       "5",
  number =       "3",
  pages =        "11:1--11:??",
  month =        jan,
  year =         "2018",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3294049",
  ISSN =         "2329-4949 (print), 2329-4957 (electronic)",
  ISSN-L =       "2329-4949",
  bibdate =      "Wed Jan 23 16:12:26 MST 2019",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/topc.bib",
  abstract =     "Virtual channels are employed to improve the
                 throughput under high traffic loads in
                 Networks-on-Chips (NoCs). However, they can impose
                 non-negligible overheads on performance by prolonging
                 clock cycle time, especially under low traffic loads
                 where the impact of virtual channels on performance is
                 trivial. In this article, we propose a novel
                 architecture, called BARAN, that can either improve
                 on-chip network performance or reduce its power
                 consumption (depending on the specific implementation
                 chosen), not both at the same time, when virtual
                 channels are underutilized; that is, the average number
                 of virtual channel allocation requests per cycle is
                 lower than the number of total virtual channels. We
                 also introduce a reconfigurable arbitration logic
                 within the BARAN architecture that can be configured to
                 have multiple latencies and, hence, multiple slack
                 times. The increased slack times are then used to
                 reduce the supply voltage of the routers or increase
                 their clock frequency in order to reduce power
                 consumption or improve the performance of the whole NoC
                 system. The power-centric design of BARAN reduces NoC
                 power consumption by 43.4\% and 40.6\% under CMP and
                 GPU workloads, on average, respectively, compared to a
                 baseline architecture while imposing negligible area
                 and performance overheads. The performance-centric
                 design of BARAN reduces the average packet latency by
                 45.4\% and 42.1\%, on average, under CMP and GPU
                 workloads, respectively, compared to the baseline
                 architecture while increasing power consumption by
                 39.7\% and 43.7\%, on average. Moreover, the
                 performance-centric BARAN postpones the network
                 saturation rate by 11.5\% under uniform random traffic
                 compared to the baseline architecture.",
  acknowledgement = ack-nhfb,
  articleno =    "11",
  fjournal =     "ACM Transactions on Parallel Computing",
  journal-URL =  "http://dl.acm.org/citation.cfm?id=2632163",
}

%%% NOTE(review): year = "2018" with month = jan sits oddly against the
%%% bibdate of January 2019 for this volume 5, number 3 entry; the
%%% neighboring 5(4) entries are dated 2019.  Verify the cover date of
%%% TOPC 5(3) against the ACM Digital Library before relying on it.
@Article{Amer:2018:LCM,
  author =       "Abdelhalim Amer and Huiwei Lu and Pavan Balaji and
                 Milind Chabbi and Yanjie Wei and Jeff Hammond and
                 Satoshi Matsuoka",
  title =        "Lock Contention Management in Multithreaded {MPI}",
  journal =      j-TOPC,
  volume =       "5",
  number =       "3",
  pages =        "12:1--12:??",
  month =        jan,
  year =         "2018",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3275443",
  ISSN =         "2329-4949 (print), 2329-4957 (electronic)",
  ISSN-L =       "2329-4949",
  bibdate =      "Wed Jan 23 16:12:26 MST 2019",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/multithreading.bib;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib;
                 http://www.math.utah.edu/pub/tex/bib/topc.bib",
  URL =          "https://dl.acm.org/ft_gateway.cfm?id=3275443",
  abstract =     "In this article, we investigate contention management
                 in lock-based thread-safe MPI libraries. Specifically,
                 we make two assumptions: (1) locks are the only form of
                 synchronization when protecting communication paths;
                 and (2) contention occurs, and thus serialization is
                 unavoidable. Our work distinguishes between lock
                 acquisitions with respect to work being performed
                 inside a critical section; productive vs. unproductive.
                 Waiting for message reception without doing anything
                 else inside a critical section is an example of
                 unproductive lock acquisition. We show that the
                 high-throughput nature of modern scalable locking
                 protocols translates into better communication progress
                 for throughput-intensive MPI communication but
                 negatively impacts latency-sensitive communication
                 because of overzealous unproductive lock acquisition.
                 To reduce unproductive lock acquisitions, we devised a
                 method that promotes threads with productive work using
                 a generic two-level priority locking protocol. Our
                 results show that using a high-throughput protocol for
                 productive work and a fair protocol for less productive
                 code paths ensures the best tradeoff for fine-grained
                 communication, whereas a fair protocol is sufficient
                 for more coarse-grained communication. Although these
                 efforts have been rewarding, scalability degradation
                 remains significant. We discuss techniques that diverge
                 from the pure locking model and offer the potential to
                 further improve scalability.",
  acknowledgement = ack-nhfb,
  articleno =    "12",
  fjournal =     "ACM Transactions on Parallel Computing",
  journal-URL =  "http://dl.acm.org/citation.cfm?id=2632163",
}

%%% NOTE(review): unlike the surrounding entries sharing this bibdate,
%%% this entry carries no URL field; the DOI field below is present and
%%% is the stable locator, so this is informational only.
@Article{Chen:2018:PDG,
  author =       "Rong Chen and Jiaxin Shi and Yanzhe Chen and Binyu
                 Zang and Haibing Guan and Haibo Chen",
  title =        "{PowerLyra}: Differentiated Graph Computation and
                 Partitioning on Skewed Graphs",
  journal =      j-TOPC,
  volume =       "5",
  number =       "3",
  pages =        "13:1--13:??",
  month =        jan,
  year =         "2018",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3298989",
  ISSN =         "2329-4949 (print), 2329-4957 (electronic)",
  ISSN-L =       "2329-4949",
  bibdate =      "Wed Jan 23 16:12:26 MST 2019",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/topc.bib",
  abstract =     "Natural graphs with skewed distributions raise unique
                 challenges to distributed graph computation and
                 partitioning. Existing graph-parallel systems usually
                 use a ``one-size-fits-all'' design that uniformly
                 processes all vertices, which either suffer from
                 notable load imbalance and high contention for
                 high-degree vertices (e.g., Pregel and GraphLab) or
                 incur high communication cost and memory consumption
                 even for low-degree vertices (e.g., PowerGraph and
                 GraphX). In this article, we argue that skewed
                 distributions in natural graphs also necessitate
                 differentiated processing on high-degree and low-degree
                 vertices. We then introduce PowerLyra, a new
                 distributed graph processing system that embraces the
                 best of both worlds of existing graph-parallel systems.
                 Specifically, PowerLyra uses centralized computation
                 for low-degree vertices to avoid frequent
                 communications and distributes the computation for
                 high-degree vertices to balance workloads. PowerLyra
                 further provides an efficient hybrid graph partitioning
                 algorithm (i.e., hybrid-cut) that combines edge-cut
                 (for low-degree vertices) and vertex-cut (for
                 high-degree vertices) with heuristics. To improve cache
                 locality of inter-node graph accesses, PowerLyra
                 further provides a locality-conscious data layout
                 optimization. PowerLyra is implemented based on the
                 latest GraphLab and can seamlessly support various
                 graph algorithms running in both synchronous and
                 asynchronous execution modes. A detailed evaluation on
                 three clusters using various graph-analytics and MLDM
                 (Machine Learning and Data Mining) applications shows
                 that PowerLyra outperforms PowerGraph by up to 5.53X
                 (from 1.24X) and 3.26X (from 1.49X) for real-world and
                 synthetic graphs, respectively, and is much faster than
                 other systems like GraphX and Giraph, yet with much
                 less memory consumption. A porting of hybrid-cut to
                 GraphX further confirms the efficiency and generality
                 of PowerLyra.",
  acknowledgement = ack-nhfb,
  articleno =    "13",
  fjournal =     "ACM Transactions on Parallel Computing",
  journal-URL =  "http://dl.acm.org/citation.cfm?id=2632163",
}

%%% NOTE(review): repaired mis-converted ampersands in the abstract:
%%% ``fetch8add (F8A)'', ``(T8S)'', and ``fetch8add instructions'' were
%%% conversion artifacts for fetch\&add (F\&A) and test-and-set (T\&S),
%%% matching the operations named in Blelloch et al. (2003).  Also
%%% removed stray spaces inside the quoted open problem.
@Article{Aravind:2019:GME,
  author =       "Alex Aravind and Wim H. Hesselink",
  title =        "Group Mutual Exclusion by Fetch-and-increment",
  journal =      j-TOPC,
  volume =       "5",
  number =       "4",
  pages =        "14:1--14:??",
  month =        mar,
  year =         "2019",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3309202",
  ISSN =         "2329-4949 (print), 2329-4957 (electronic)",
  ISSN-L =       "2329-4949",
  bibdate =      "Mon Mar 11 18:54:51 MDT 2019",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/topc.bib",
  URL =          "https://dl.acm.org/ft_gateway.cfm?id=3309202\&ftid=2042588\&dwn=1\&CFID=115464021\&CFTOKEN=e83128dce764b9e9-5990FFA0-D877-BDE1-A02F1158E3AD2EED",
  abstract =     "The group mutual exclusion (GME) problem (also called
                 the room synchronization problem) arises in various
                 practical applications that require concurrent data
                 sharing. Group mutual exclusion aims to achieve
                 exclusive access to a shared resource (a shared room)
                 while facilitating concurrency among non-conflicting
                 requests. The problem is that threads with distinct
                 interests are not allowed to access the shared resource
                 concurrently, but multiple threads with same interest
                 can. In Blelloch et al. (2003), the authors presented a
                 simple solution to the room synchronization problem
                 using fetch\&add (F\&A) and test-and-set (T\&S) atomic
                 operations. This algorithm has $O(m)$ remote memory
                 references (RMRs) in the cache coherent (CC) model,
                 where $m$ is the number of forums. In Bhatt and Huang
                 (2010), an open problem was posed: ``Is it possible to
                 design a GME algorithm with constant RMR for the CC
                 model using fetch\&add instructions?'' This question is
                 partially answered in this article by presenting a
                 group mutual exclusion algorithm using
                 fetch-and-increment instructions. The algorithm is
                 simple and scalable.",
  acknowledgement = ack-nhfb,
  articleno =    "14",
  fjournal =     "ACM Transactions on Parallel Computing",
  journal-URL =  "http://dl.acm.org/citation.cfm?id=2632163",
}

%%% NOTE(review): repaired extraction garbling in the abstract:
%%% ``Parallel Input output'' had lost the slash of ``input/output''
%%% and ``inter dependencies'' was a split of ``interdependencies''.
@Article{Behzad:2019:OPH,
  author =       "Babak Behzad and Surendra Byna and Prabhat and Marc
                 Snir",
  title =        "Optimizing {I/O} Performance of {HPC} Applications
                 with Autotuning",
  journal =      j-TOPC,
  volume =       "5",
  number =       "4",
  pages =        "15:1--15:??",
  month =        mar,
  year =         "2019",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3309205",
  ISSN =         "2329-4949 (print), 2329-4957 (electronic)",
  ISSN-L =       "2329-4949",
  bibdate =      "Mon Mar 11 18:54:51 MDT 2019",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/topc.bib",
  URL =          "https://dl.acm.org/ft_gateway.cfm?id=3309205\&ftid=2044223\&dwn=1\&CFID=115464021\&CFTOKEN=e83128dce764b9e9-5990FFA0-D877-BDE1-A02F1158E3AD2EED",
  abstract =     "Parallel input/output is an essential component of
                 modern high-performance computing (HPC). Obtaining good
                 I/O performance for a broad range of applications on
                 diverse HPC platforms is a major challenge, in part,
                 because of complex interdependencies between I/O
                 middleware and hardware. The parallel file system and
                 I/O middleware layers all offer optimization parameters
                 that can, in theory, result in better I/O performance.
                 Unfortunately, the right combination of parameters is
                 highly dependent on the application, HPC platform,
                 problem size, and concurrency. Scientific application
                 developers do not have the time or expertise to take on
                 the substantial burden of identifying good parameters
                 for each problem configuration. They resort to using
                 system defaults, a choice that frequently results in
                 poor I/O performance. We expect this problem to be
                 compounded on exascale-class machines, which will
                 likely have a deeper software stack with hierarchically
                 arranged hardware resources. We present as a solution
                 to this problem an autotuning system for optimizing I/O
                 performance, I/O performance modeling, I/O tuning, and
                 I/O patterns. We demonstrate the value of this
                 framework across several HPC platforms and applications
                 at scale.",
  acknowledgement = ack-nhfb,
  articleno =    "15",
  fjournal =     "ACM Transactions on Parallel Computing",
  journal-URL =  "http://dl.acm.org/citation.cfm?id=2632163",
}

%%% NOTE(review): the trailing ``github [17]'' in the abstract is a
%%% citation marker carried over from the paper's own reference list;
%%% left untouched to keep the abstract text as published.
@Article{Maier:2019:CHT,
  author =       "Tobias Maier and Peter Sanders and Roman Dementiev",
  title =        "Concurrent Hash Tables: Fast and General(?)!",
  journal =      j-TOPC,
  volume =       "5",
  number =       "4",
  pages =        "16:1--16:??",
  month =        mar,
  year =         "2019",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3309206",
  ISSN =         "2329-4949 (print), 2329-4957 (electronic)",
  ISSN-L =       "2329-4949",
  bibdate =      "Mon Mar 11 18:54:51 MDT 2019",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/hash.bib;
                 http://www.math.utah.edu/pub/tex/bib/topc.bib",
  URL =          "https://dl.acm.org/ft_gateway.cfm?id=3309206\&ftid=2042589\&dwn=1\&CFID=115464021\&CFTOKEN=e83128dce764b9e9-5990FFA0-D877-BDE1-A02F1158E3AD2EED",
  abstract =     "Concurrent hash tables are one of the most important
                 concurrent data structures, which are used in numerous
                 applications. For some applications, it is common that
                 hash table accesses dominate the execution time. To
                 efficiently solve these problems in parallel, we need
                 implementations that achieve speedups in highly
                 concurrent scenarios. Unfortunately, currently
                 available concurrent hashing libraries are far away
                 from this requirement, in particular, when adaptively
                 sized tables are necessary or contention on some
                 elements occurs. Our starting point for better
                 performing data structures is a fast and simple
                 lock-free concurrent hash table based on linear probing
                 that is, however, limited to word-sized key-value types
                 and does not support dynamic size adaptation. We
                 explain how to lift these limitations in a provably
                 scalable way and demonstrate that dynamic growing has a
                 performance overhead comparable to the same
                 generalization in sequential hash tables. We perform
                 extensive experiments comparing the performance of our
                 implementations with six of the most widely used
                 concurrent hash tables. Ours are considerably faster
                 than the best algorithms with similar restrictions and
                 an order of magnitude faster than the best more general
                 tables. In some extreme cases, the difference even
                 approaches four orders of magnitude. All our
                 implementations discussed in this publication can be
                 found on github [17].",
  acknowledgement = ack-nhfb,
  articleno =    "16",
  fjournal =     "ACM Transactions on Parallel Computing",
  journal-URL =  "http://dl.acm.org/citation.cfm?id=2632163",
}

%%% NOTE(review): the URL below embeds ephemeral ACM session tokens
%%% (CFID/CFTOKEN) and is likely to rot; the DOI field is the stable
%%% locator for this article.
@Article{Cruz:2019:ETM,
  author =       "Eduardo H. M. Cruz and Matthias Diener and La{\'e}rcio
                 L. Pilla and Philippe O. A. Navaux",
  title =        "{EagerMap}: a Task Mapping Algorithm to Improve
                 Communication and Load Balancing in Clusters of
                 Multicore Systems",
  journal =      j-TOPC,
  volume =       "5",
  number =       "4",
  pages =        "17:1--17:??",
  month =        mar,
  year =         "2019",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3309711",
  ISSN =         "2329-4949 (print), 2329-4957 (electronic)",
  ISSN-L =       "2329-4949",
  bibdate =      "Mon Mar 11 18:54:51 MDT 2019",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/topc.bib",
  URL =          "https://dl.acm.org/ft_gateway.cfm?id=3309711\&ftid=2044225\&dwn=1\&CFID=115464021\&CFTOKEN=e83128dce764b9e9-5990FFA0-D877-BDE1-A02F1158E3AD2EED",
  abstract =     "Communication between tasks and load imbalance have
                 been identified as a major challenge for the
                 performance and energy efficiency of parallel
                 applications. A common way to improve communication is
                 to increase its locality, that is, to reduce the
                 distances of data transfers, prioritizing the usage of
                 faster and more efficient local interconnections over
                 remote ones. Regarding load imbalance, cores should
                 execute a similar amount of work. An important problem
                 to be solved in this context is how to determine an
                 optimized mapping of tasks to cluster nodes and cores
                 that increases the overall locality and load balancing.
                 In this article, we propose the EagerMap algorithm to
                 determine task mappings, which is based on a greedy
                 heuristic to match application communication patterns
                 to hardware hierarchies and which can also consider the
                 task load. Compared to previous algorithms, EagerMap is
                 faster, scales better, and supports more types of
                 computer systems, while maintaining the same or better
                 quality of the determined task mapping. EagerMap is
                 therefore an interesting choice for task mapping on a
                 variety of modern parallel architectures.",
  acknowledgement = ack-nhfb,
  articleno =    "17",
  fjournal =     "ACM Transactions on Parallel Computing",
  journal-URL =  "http://dl.acm.org/citation.cfm?id=2632163",
}