%%% -*-BibTeX-*-
%%% ====================================================================
%%%  BibTeX-file{
%%%     author          = "Nelson H. F. Beebe",
%%%     version         = "1.02",
%%%     date            = "13 May 2011",
%%%     time            = "18:06:48 MDT",
%%%     filename        = "supercomputing2003.bib",
%%%     address         = "University of Utah
%%%                        Department of Mathematics, 110 LCB
%%%                        155 S 1400 E RM 233
%%%                        Salt Lake City, UT 84112-0090
%%%                        USA",
%%%     telephone       = "+1 801 581 5254",
%%%     FAX             = "+1 801 581 4148",
%%%     URL             = "http://www.math.utah.edu/~beebe",
%%%     checksum        = "46413 2188 12588 123939",
%%%     email           = "beebe at math.utah.edu, beebe at acm.org,
%%%                        beebe at computer.org (Internet)",
%%%     codetable       = "ISO/ASCII",
%%%     keywords        = "BibTeX, bibliography, SC2003, Supercomputing
%%%                        2003",
%%%     license         = "public domain",
%%%     supported       = "yes",
%%%     docstring       = "This is a complete bibliography of papers
%%%                        published in the proceedings of
%%%                        Supercomputing '2003.
%%%                        The conference World-Wide Web site is
%%%                            http://www.sc-conference.org/sc2003/
%%%                        The organizers of this conference series
%%%                        maintain a World-Wide Web site at
%%%                            http://www.supercomp.org/
%%%                        where pointers to Web pages for the
%%%                        conferences from 1988 to date may be found.
%%%                        At version 1.02, the year coverage looked
%%%                        like this:
%%%                             2003 (  61)
%%%                             InProceedings:   60
%%%                             Proceedings:      1
%%%                             Total entries:   61
%%%                        In this bibliography, entries are sorted in
%%%                        order of PDF file numbers.
%%%                        The on-line electronic proceedings do not
%%%                        contain sequential page numbers, although
%%%                        there is an ISBN assigned for the
%%%                        proceedings.  A pagecount field is given with
%%%                        each entry, extracted from the PDF file: some
%%%                        of the articles lack page numbers altogether,
%%%                        others number pages 1, 2, 3, ...
%%%                        The checksum field above contains a CRC-16
%%%                        checksum as the first value, followed by the
%%%                        equivalent of the standard UNIX wc (word
%%%                        count) utility output of lines, words, and
%%%                        characters.  This is produced by Robert
%%%                        Solovay's checksum utility.",
%%%  }
%%% ====================================================================

    "\ifx \undefined \TM \def \TM {${}^{\sc TM}$} \fi"

%%% ====================================================================
%%% Acknowledgement abbreviations:

%%% Bibliographer acknowledgement string; referenced by the
%%% "acknowledgement" field of every entry in this file.
@String{ack-nhfb = "Nelson H. F. Beebe,
                    University of Utah,
                    Department of Mathematics, 110 LCB,
                    155 S 1400 E RM 233,
                    Salt Lake City, UT 84112-0090, USA,
                    Tel: +1 801 581 5254,
                    FAX: +1 801 581 4148,
                    e-mail: \path|beebe@math.utah.edu|,
                            \path|beebe@computer.org| (Internet),
                    URL: \path|http://www.math.utah.edu/~beebe/|"}

%%% ====================================================================
%%% Publishers and their addresses:

%%% Publisher abbreviations: each pub-XXX name string is paired with a
%%% pub-XXX:adr address string for use in publisher/address fields.
@String{pub-ACM                 = "ACM Press"}

@String{pub-ACM:adr             = "New York, NY 10036, USA"}

@String{pub-IEEE                = "IEEE Computer Society Press"}

@String{pub-IEEE:adr            = "1109 Spring Street, Suite 300,
                                  Silver Spring, MD 20910, USA"}

%%% ====================================================================
%%% Bibliography entries.

  author =       "Hong Tang and Tao Yang",
  title =        "An Efficient Data Location Protocol for
                 Self-organizing Storage Clusters",
  crossref =     "ACM:2003:SII",
  pages =        "??--??",
  year =         "2003",
  bibdate =      "Wed Nov 26 07:34:20 2003",
  URL =          "http://www.sc-conference.org/sc2003/inter_cal/inter_cal_detail.php?eventid=10686#0;
  abstract =     "Component additions and failures are common for
                 large-scale storage clusters in production
                 environments. To improve availability and
                 manageability, we investigate and compare data location
                 schemes for a large self-organizing storage cluster
                 that can quickly adapt to the additions or departures
                 of storage nodes. We further present an efficient
                 location scheme that differentiates between small and
                 large file blocks for reduced management overhead
                 compared to uniform strategies. In our protocol, small
                 blocks, which are typically in large quantities, are
                 placed through consistent hashing. Large blocks, much
                 fewer in practice, are placed through a usage-based
                 policy, and their locations are tracked by Bloom
                 filters. The proposed scheme results in improved
                 storage utilization even with non-uniform cluster
                 nodes. To achieve high scalability and fault
                 resilience, this protocol is fully distributed, relies
                 only on soft states, and supports data replication. We
                 demonstrate the effectiveness and efficiency of this
                 protocol through trace-driven simulation.",
  acknowledgement = ack-nhfb,

  author =       "Changxun Wu and Randal Burns",
  title =        "Handling Heterogeneity in Shared-Disk File Systems",
  crossref =     "ACM:2003:SII",
  pages =        "??--??",
  year =         "2003",
  bibdate =      "Wed Nov 26 07:34:20 2003",
  URL =          "http://www.sc-conference.org/sc2003/inter_cal/inter_cal_detail.php?eventid=10686#1;
  abstract =     "We develop and evaluate a system for load management
                 in shared-disk file systems built on clusters of
                 heterogeneous computers. The system generalizes load
                 balancing and server provisioning. It balances file
                 metadata workload by moving file sets among cluster
                 server nodes. It also responds to changing server
                 resources that arise from failure and recovery and
                 dynamically adding or removing servers. The system is
                 adaptive and self-managing. It operates without any
                 a-priori knowledge of workload properties or the
                 capabilities of the servers. Rather, it continuously
                 tunes load placement using a technique called adaptive,
                 non-uniform (ANU) randomization. ANU randomization
                 realizes the scalability and metadata reduction
                 benefits of hash-based, randomized placement
                 techniques. It also avoids hashing's drawbacks: load
                 skew, inability to cope with heterogeneity, and lack of
                 tunability. Simulation results show that our
                 load-management algorithm performs comparably to a
                 prescient algorithm.",
  acknowledgement = ack-nhfb,

  author =       "Kiran Nagaraja and Neeraj Krishnan and Ricardo
                 Bianchini and Richard P. Martin and Thu D. Nguyen",
  title =        "Quantifying and Improving the Availability of
                 High-Performance Cluster-Based {Internet} Services",
  crossref =     "ACM:2003:SII",
  pages =        "??--??",
  year =         "2003",
  bibdate =      "Wed Nov 26 07:34:20 2003",
  URL =          "http://www.sc-conference.org/sc2003/inter_cal/inter_cal_detail.php?eventid=10686#2;
  abstract =     "Cluster-based servers can substantially increase
                 performance when nodes cooperate to globally manage
                 resources. However, in this paper we show that
                 cooperation results in a substantial availability loss,
                 in the absence of high-availability mechanisms.
                 Specifically, we show that a sophisticated
                 cluster-based Web server, which gains a factor of 3 in
                 performance through cooperation, increases service
                 unavailability by a factor of 10 over a non-cooperative
                 version. We then show how to augment this Web server
                 with software components embodying a small set of
                 high-availability techniques to regain the lost
                 availability. Among other interesting observations, we
                 show that the application of multiple high-availability
                 techniques, each implemented independently in its own
                 subsystem, can lead to inconsistent recovery actions.
                 We also show that a novel technique called Fault Model
                 Enforcement can be used to resolve such
                 inconsistencies. Augmenting the server with these
                 techniques led to a final expected availability of
                 close to 99.99\%.",
  acknowledgement = ack-nhfb,

  author =       "Philip C. Roth and Dorian C. Arnold and Barton P.
  title =        "{MRNet}: {A} Software-Based Multicast\slash Reduction
                 Network for Scalable Tools",
  crossref =     "ACM:2003:SII",
  pages =        "??--??",
  year =         "2003",
  bibdate =      "Wed Nov 26 07:34:20 2003",
  URL =          "http://www.sc-conference.org/sc2003/inter_cal/inter_cal_detail.php?eventid=10687#0;
  abstract =     "We present MRNet, a software-based multicast/reduction
                 network for building scalable performance and system
                 administration tools. MRNet supports multiple
                 simultaneous, asynchronous collective communication
                 operations. MRNet is flexible, allowing tool builders
                 to tailor its process network topology to suit their
                 tool's requirements and the underlying system's
                 capabilities. MRNet is extensible, allowing tool
                 builders to incorporate custom data reductions to
                 augment its collection of built-in reductions. We
                 evaluated MRNet in a simple test tool and also
                 integrated into an existing, real-world performance
                 tool with up to 512 tool back-ends. In the real-world
                 tool, we used MRNet not only for multicast and simple
                 data reductions but also with custom histogram and
                 clock skew detection reductions. In our experiments,
                 the MRNet-based tools showed significantly better
                 performance than the tools without MRNet for average
                 message latency and throughput, overall tool start-up
                 latency, and performance data processing throughput.",
  acknowledgement = ack-nhfb,
  keywords =     "aggregation; scalability; tools; multicast;

  author =       "Barton Miller and Ana Cort{\'e}s and Miquel Senar and
                 Miron Livny",
  title =        "The {Tool Daemon Protocol}",
  crossref =     "ACM:2003:SII",
  pages =        "??--??",
  year =         "2003",
  bibdate =      "Wed Nov 26 07:34:20 2003",
  URL =          "http://www.sc-conference.org/sc2003/inter_cal/inter_cal_detail.php?eventid=10687#1;
  abstract =     "Run-time tools are crucial to program development. In
                 our desktop computer environments, we take for granted
                 the availability of tools for operations such as
                 debugging, profiling, tracing, checkpointing, and
                 visualization. When programs move into distributed or
                 Grid environments, it is difficult to find such tools.
                 This difficulty is caused by the complex interactions
                 necessary between application program, operating system
                 and layers of job scheduling and process management
                 software. As a result, each run-time tool must be
                 individually ported to run under a particular job
                 management system; for $m$ tools and $n$ environments,
                 the problem becomes an $m \times n$ effort, rather than
                 the hoped-for $m + n$ effort. Variations in underlying
                 operating systems can make this problem even worse. The
                 consequence of this situation is a paucity of tools in
                 distributed and Grid computing environments. In
                 response to the problem, we have analyzed a variety of
                 job scheduling environments and run-time tools to
                 better understand their interactions. From this
                 analysis, we isolated what we believe are the essential
                 interactions between the runtime tool, job scheduler
                 and resource manager, and application program. We are
                 proposing a standard interface, called the Tool
                 D{\ae}mon Protocol (TDP) that codifies these
                 interactions and provides the necessary communication
                 functions. We have implemented a pilot TDP library and
                 experimented with Parador, a prototype using the
                 Paradyn Parallel Performance tools profiling jobs
                 running under the Condor batch-scheduling
  acknowledgement = ack-nhfb,

  author =       "Lingyun Yang and Jennifer M. Schopf and Ian Foster",
  title =        "Conservative Scheduling: Using Predicted Variance to
                 Improve Scheduling Decisions in Dynamic Environments",
  crossref =     "ACM:2003:SII",
  pages =        "??--??",
  year =         "2003",
  bibdate =      "Wed Nov 26 07:34:20 2003",
  URL =          "http://www.sc-conference.org/sc2003/inter_cal/inter_cal_detail.php?eventid=10687#2;
  abstract =     "In heterogeneous and dynamic environments, efficient
                 execution of parallel computations can require mappings
                 of tasks to processors whose performance is both
                 irregular (because of heterogeneity) and time-varying
                 (because of dynamicity). While adaptive domain
                 decomposition techniques have been used to address
                 heterogeneous resource capabilities, temporal
                 variations in those capabilities have seldom been
                 considered. We propose a conservative scheduling policy
                 that uses information about expected future variance in
                 resource capabilities to produce more efficient data
                 mapping decisions. We first present techniques, based
                 on time series predictors that we developed in previous
                 work, for predicting CPU load at some future time
                 point, average CPU load for some future time interval,
                 and variation of CPU load over some future time
                 interval. We then present a family of stochastic
                 scheduling algorithms that exploit such predictions of
                 future availability and variability when making data
                 mapping decisions. Finally, we describe experiments in
                 which we apply our techniques to an astrophysics
                 application. The results of these experiments
                 demonstrate that conservative scheduling can produce
                 execution times that are both significantly faster and
                 less variable than other techniques.",
  acknowledgement = ack-nhfb,

  author =       "Yonghua Ding and Zhiyuan Li",
  title =        "A Compiler Analysis of Interprocedural Data
  crossref =     "ACM:2003:SII",
  pages =        "??--??",
  year =         "2003",
  bibdate =      "Wed Nov 26 07:34:20 2003",
  URL =          "http://www.sc-conference.org/sc2003/inter_cal/inter_cal_detail.php?eventid=10692#0;
  abstract =     "This paper presents a compiler analysis for data
                 communication for the purpose of transforming ordinary
                 programs into ones that run on distributed systems.
                 Such transformations have been used for process
                 migration and computation offloading to improve the
                 performance of mobile computing devices. In a
                 client-server distributed environment, the efficiency
                 of an application can be improved by careful
                 partitioning of tasks between the server and the
                 client. Optimal task partitioning depends on the
                 tradeoff between the computation workload and the
                 communication cost. Our compiler analysis, assisted by
                 a minimum set of user assertions, estimates the amount
                 of data communication between procedures. The paper
                 also presents experimental results based on an
                 implementation in the GCC compiler. The static
                 estimates for several multimedia programs are compared
                 against dynamic measurement performed using Shade, a
                 SUN Microsystem's instruction-level simulator. The
                 results show a high precision of the static analysis
                 for most pairs of the procedures.",
  acknowledgement = ack-nhfb,

  author =       "Arun Chauhan and Cheryl McCosh and Ken Kennedy and
                 Richard Hanson",
  title =        "Automatic Type-Driven Library Generation for
                 Telescoping Languages",
  crossref =     "ACM:2003:SII",
  pages =        "??--??",
  year =         "2003",
  bibdate =      "Wed Nov 26 07:34:20 2003",
  URL =          "http://www.sc-conference.org/sc2003/inter_cal/inter_cal_detail.php?eventid=10692#1;
  abstract =     "Telescoping languages is a strategy to automatically
                 generate highly-optimized domain-specific libraries.
                 The key idea is to create specialized variants of
                 library procedures through extensive offline
                 processing. This paper describes a telescoping system,
                 called ARGen, which generates high-performance Fortran
                 or C libraries from prototype Matlab code for the
                 linear algebra library, ARPACK. ARGen uses variable
                 types to guide procedure specializations on possible
                 calling contexts.\par

                 ARGen needs to infer Matlab types in order to speculate
                 on the possible variants of library procedures, as well
                 as to generate code. This paper shows that our
                 type-inference system is powerful enough to generate
                 all the variants needed for ARPACK automatically from
                 the Matlab development code. The ideas demonstrated
                 here provide a basis for building a more general
                 telescoping system for Matlab.",
  acknowledgement = ack-nhfb,

  author =       "Wei Du and Renato Ferreira and Gagan Agrawal",
  title =        "Compiler Support for Exploiting Coarse-Grained
                 Pipelined Parallelism",
  crossref =     "ACM:2003:SII",
  pages =        "??--??",
  year =         "2003",
  bibdate =      "Wed Nov 26 07:34:20 2003",
  URL =          "http://www.sc-conference.org/sc2003/inter_cal/inter_cal_detail.php?eventid=10692#2;
  abstract =     "The emergence of grid and a new class of data-driven
                 applications is making a new form of parallelism
                 desirable, which we refer to as coarse-grained
                 pipelined parallelism. This paper reports on a
                 compilation system developed to exploit this form of
                 parallelism. We use a dialect of Java that exposes both
                 pipelined and data parallelism to the compiler. Our
                 compiler is responsible for selecting a set of
                 candidate filter boundaries, determining the volume of
                 communication required if a particular boundary is
                 chosen, performing the decomposition, and generating
                 code. We have developed a one-pass algorithm for
                 determining the required communication between
                 consecutive filters. We have developed a cost model for
                 estimating the execution time for a given
                 decomposition, and a dynamic programming algorithm for
                 performing the decomposition. Detailed evaluation of
                 our current compiler using four data-driven
                 applications demonstrate the feasibility of our
  acknowledgement = ack-nhfb,

  author =       "Dong Lu and Peter August Dinda",
  title =        "Synthesizing Realistic Computational Grids",
  crossref =     "ACM:2003:SII",
  pages =        "??--??",
  year =         "2003",
  bibdate =      "Wed Nov 26 07:34:20 2003",
  URL =          "http://www.sc-conference.org/sc2003/inter_cal/inter_cal_detail.php?eventid=10690#0;
  abstract =     "Realistic workloads are essential in evaluating
                 middleware for computational grids. One important
                 component is the raw grid itself: a network topology
                 graph annotated with the hardware and software
                 available on each node and link. This paper defines our
                 requirements for grid generation and presents GridG,
                 our extensible generator. We describe GridG in two
                 steps: topology generation and annotation. For topology
                 generation, we have both model and mechanism. We extend
                 Tiers, an existing tool from the networking community,
                 to produce graphs that obey recently discovered power
                 laws of Internet topology. We also contribute to
                 network topology theory by illustrating a contradiction
                 between two laws and proposing a new version of one of
                 them. For annotation, GridG captures intra- and
                 inter-host correlations between attributes using
                 conditional probability rules. We construct a set of
                 rules, including one based on empirical evidence of OS
                 concentration in subnets, that produce sensible host
  acknowledgement = ack-nhfb,

  author =       "Xin Liu and Andrew A. Chien",
  title =        "Traffic-based Load Balance for Scalable Network
  crossref =     "ACM:2003:SII",
  pages =        "??--??",
  year =         "2003",
  bibdate =      "Wed Nov 26 07:34:20 2003",
  URL =          "http://www.sc-conference.org/sc2003/inter_cal/inter_cal_detail.php?eventid=10690#1;
  abstract =     "Load balance is critical to achieving scalability for
                 large network emulation studies, which are of
                 compelling interest for emerging Grid, Peer to Peer,
                 and other distributed applications and middleware.
                 Achieving load balance in emulation is difficult
                 because of irregular network structure and
                 unpredictable network traffic. We formulate load
                 balance as a graph partitioning problem and apply
                 classical graph partitioning algorithms to it. The
                 primary challenge in this approach is how to extract
                 useful information from the network emulation and
                 present it to the graph partitioning algorithms in a
                 way that reflects the load balance requirement in the
                 original emulation problem. Using a large-scale network
                 emulation system called MaSSF, we explore three
                 approaches for partitioning, based on purely static
                 topology information (TOP), combining topology and
                 application placement information (PLACE), and
                 combining topology and application profile data
                 (PROFILE). These studies show that exploiting static
                 topology and application placement information can
                 achieve reasonable load balance, but a profile-based
                 approach further improves load balance for even large
                 scale network emulation. In our experiments, PROFILE
                 improves load balance by 50\% to 66\% and emulation
                 time is reduced up to 50\% compared to purely static
                 topology-based approaches.",
  acknowledgement = ack-nhfb,

  author =       "Ali Raza Butt and Rongmei Zhang and Y. Charlie Hu",
  title =        "A Self-Organizing Flock of {Condors}",
  crossref =     "ACM:2003:SII",
  pages =        "??--??",
  year =         "2003",
  bibdate =      "Wed Nov 26 07:34:20 2003",
  URL =          "http://www.sc-conference.org/sc2003/inter_cal/inter_cal_detail.php?eventid=10690#2;
  abstract =     "Condor provides high throughput computing by
                 leveraging idle cycles on off-the-shelf desktop
                 machines. It also supports flocking, a mechanism for
                 sharing resources among Condor pools. Since Condor
                 pools distributed over a wide area can have dynamically
                 changing availability and sharing preferences, the
                 current flocking mechanism based on static
                 configurations can limit the potential of sharing
                 resources across Condor pools. This paper presents a
                 technique for resource discovery in distributed Condor
                 pools using peer-to-peer mechanisms that are
                 self-organizing, fault-tolerant, scalable, and
                 locality-aware. Locality-awareness guarantees that
                 applications are not shipped across long distances when
                 nearby resources are available. Measurements using a
                 synthetic job trace show that self-organized flocking
                 reduces the maximum job wait time in queue for a
                 heavily loaded pool by a factor of 10 compared to
                 without flocking. Simulations of 1000 Condor pools are
                 also presented and the results confirm that our
                 technique discovers and utilizes nearby resources in
                 the physical network.",
  acknowledgement = ack-nhfb,

  author =       "Ryan M. Olson and Michael W. Schmidt and Mark S.
                 Gordon and Alistair P. Rendell",
  title =        "Enabling the Efficient Use of {SMP} Clusters: The
                 {GAMESS\slash DDI} Model",
  crossref =     "ACM:2003:SII",
  pages =        "??--??",
  year =         "2003",
  bibdate =      "Wed Nov 26 07:34:20 2003",
  URL =          "http://www.sc-conference.org/sc2003/inter_cal/inter_cal_detail.php?eventid=10691#0;
  abstract =     "An important advance in cluster computing is the
                 evolution from single processor clusters to
                 multiprocessor SMP clusters. Due to the increased
                 complexity in the memory model on SMP clusters, new
                 approaches are needed for applications that make use of
                 distributed-memory paradigms. This paper presents new
                 communications software developments that are designed
                 to take advantage of SMP cluster hardware. Although the
                 specific focus is on the central field of computational
                 chemistry and materials science, as embodied in the
                 popular electronic structure package GAMESS (General
                 Atomic and Molecular Electronic Structure System), the
                 impact of these new developments will be far broader in
                 scope. Following a summary of the essential features of
                 the distributed data interface (DDI) in the current
                 implementation of GAMESS, the new developments for SMP
                 clusters are described. The advantages of these new
                 features are illustrated using timing benchmarks on
                 several hardware platforms, using a typical
                 computational chemistry application.",
  acknowledgement = ack-nhfb,

  author =       "Jin Ding and Jian Huang and Micah Beck and Shaotao Liu
                 and Terry Moore and Stephen Soltesz",
  title =        "Remote Visualization by Browsing Image Based Databases
                 with Logistical Networking",
  crossref =     "ACM:2003:SII",
  pages =        "??--??",
  year =         "2003",
  bibdate =      "Wed Nov 26 07:34:20 2003",
  URL =          "http://www.sc-conference.org/sc2003/inter_cal/inter_cal_detail.php?eventid=10691#1;
  abstract =     "The need to provide remote visualization of large
                 datasets with adequate levels of quality and
                 interactivity has become a major impediment to
                 distributed collaboration in Computational Science.
                 Although Image Based Rendering (IBR) techniques based
                 on plenoptic functions have some important advantages
                 over other approaches to this problem, they suffer from
                 an inability to deal with issues of network latency and
                 server load, due to the large size of the IBR databases
                 they generate. Consequently, IBR techniques have been
                 left largely unexplored for this purpose. In this paper
                 we describe strategies for addressing these obstacles
                 using Logistical Networking (LoN), which is a new and
                 highly scalable approach to deploying storage as a
                 shared communication resource. Leveraging LoN
                 technology and infrastructure, we developed a remote
                 visualization system based on concepts of light field
                 rendering, an IBR method using a 4-D plenoptic
                 function. Our system extends existing work on light
                 fields by employing a modified method of
                 parameterization and data organization that supports
                 more efficient prefetching, caching and loss-less
                 compression. Using this approach, we have been able to
                 interactively browse multi-gigabyte, high-resolution
                 light field databases across the wide area network at
                 30 frames per second.",
  acknowledgement = ack-nhfb,

  author =       "Kwan-Liu Ma and Aleksander Stompel and Jacobo Bielak
                 and Omar Ghattas and Eui Joong Kim",
  title =        "Visualizing Very Large-Scale Earthquake Simulations",
  crossref =     "ACM:2003:SII",
  pages =        "??--??",
  year =         "2003",
  bibdate =      "Wed Nov 26 07:34:20 2003",
  URL =          "http://www.sc-conference.org/sc2003/inter_cal/inter_cal_detail.php?eventid=10691#2;
  abstract =     "This paper presents a parallel adaptive rendering
                 algorithm and its performance for visualizing
                 time-varying unstructured volume data generated from
                 large-scale earthquake simulations. The objective is to
                 visualize 3D seismic wave propagation generated from a
                 0.5 Hz simulation of the Northridge earthquake, which
                 is the highest resolution volume visualization of an
                 earthquake simulation performed to date. This scalable
                 high-fidelity visualization solution we provide to the
                 scientists allows them to explore in the temporal,
                 spatial, and visualization domain of their data at high
                 resolution. This new high resolution explorability,
                 likely not presently available to most computational
                 science groups, will help lead to many new insights.
                 The performance study we have conducted on a massively
                 parallel computer operated at the Pittsburgh
                 Supercomputing Center helps direct our design of a
                 simulation-time visualization strategy for the
                 higher-resolution, 1Hz and 2 Hz, simulations.",
  acknowledgement = ack-nhfb,
  keywords =     "earthquake modeling; high-performance computing;
                 massively parallel supercomputing; scientific
                 visualization; parallel rendering; time-varying data;
                 unstructured grids; volume rendering; wave

  author =       "Jiuxing Liu and Balasubramanian Chandrasekaran and
                 Jiesheng Wu and Weihang Jiang and Sushmitha Kini and
                 Weikuan Yu and Darius Buntinas and Pete Wyckoff and D.
                 K. Panda",
  title =        "Performance Comparison of {MPI} Implementations over
                 {InfiniBand}, {Myrinet} and {Quadrics}",
  crossref =     "ACM:2003:SII",
  pages =        "??--??",
  year =         "2003",
  bibdate =      "Wed Nov 26 07:34:20 2003",
  URL =          "http://www.sc-conference.org/sc2003/inter_cal/inter_cal_detail.php?eventid=10696#0;
  abstract =     "In this paper, we present a comprehensive performance
                 comparison of MPI implementations over InfiniBand,
                 Myrinet and Quadrics. Our performance evaluation
                 consists of two major parts. The first part consists of
                 a set of MPI level micro-benchmarks that characterize
                 different aspects of MPI implementations. The second
                 part of the performance evaluation consists of
                 application level benchmarks. We have used the NAS
                 Parallel Benchmarks and the sweep3D benchmark. We not
                 only present the overall performance results, but also
                 relate application communication characteristics to the
                 information we acquired from the micro-benchmarks. Our
                 results show that the three MPI implementations all
                 have their advantages and disadvantages. For our 8-node
                 cluster, InfiniBand can offer significant performance
                 improvements for a number of applications compared with
                 Myrinet and Quadrics when using the PCI-X bus. Even
                 with just the PCI bus, InfiniBand can still perform
                 better if the applications are bandwidth-bound.",
  acknowledgement = ack-nhfb,

  author =       "Aurelien Bouteiller and Franck Cappello and Thomas
                 Herault and Geraud Krawezik and Pierre Lemarinier and
                 Frederic Magniette",
  title =        "{MPICH-V2}: a Fault Tolerant {MPI} for Volatile Nodes
                 based on Pessimistic Sender Based Message Logging",
  crossref =     "ACM:2003:SII",
  pages =        "??--??",
  year =         "2003",
  bibdate =      "Wed Nov 26 07:34:20 2003",
  URL =          "http://www.sc-conference.org/sc2003/inter_cal/inter_cal_detail.php?eventid=10696#1;
  abstract =     "Execution of MPI applications on clusters and Grid
                 deployments suffering from node and network failures
                 motivates the use of fault tolerant MPI
                 implementations. We present MPICH-V2 (the second
                 protocol of MPICHV project), an automatic fault
                 tolerant MPI implementation using an innovative
                 protocol that removes the most limiting factor of the
                 pessimistic message logging approach: reliable logging
                 of in transit messages. MPICH-V2 relies on
                 uncoordinated checkpointing, sender based message
                 logging and remote reliable logging of message logical
                 clocks. This paper presents the architecture of
                 MPICH-V2, its theoretical foundation and the
                 performance of the implementation. We compare MPICH-V2
                 to MPICH-V1 and MPICH-P4 evaluating (a) its
                 point-to-point performance, (b) the performance for the
                 NAS benchmarks, (c) the application performance when
                 many faults occur during the execution. Experimental
                 results demonstrate that MPICH-V2 provides performance
                 close to MPICH-P4 for applications using large messages
                 while reducing dramatically the number of reliable
                 nodes compared to MPICH-V1.",
  acknowledgement = ack-nhfb,

  author =       "Stephen D. Kleban and Scott H. Clearwater",
  title =        "Hierarchical Dynamics, Interarrival Times, and
  crossref =     "ACM:2003:SII",
  pages =        "??--??",
  year =         "2003",
  bibdate =      "Wed Nov 26 07:34:20 2003",
  URL =          "http://www.sc-conference.org/sc2003/inter_cal/inter_cal_detail.php?eventid=10696#2;
  abstract =     "We report on a model of the distribution of job
                 submission interarrival times in supercomputers.
                 Interarrival times are modeled as a consequence of a
                 complicated set of decisions between users, the queuing
                 algorithm, and other policies. This cascading hierarchy
                 of decision-making processes leads to a particular kind
                 of heavy-tailed distribution. Specifically,
                 hierarchically constrained systems suggest that fatter
                 tails are due to more levels coming into play in the
                 overall decision-making process. The key contribution
                 of this paper is that heavier tails resulting from more
                 complex decision-making processes, that is more
                 hierarchical levels, will lead to overall worse
                 performance, even when the average interarrival time is
                 the same. Finally, we offer some suggestions for how to
                 overcome these issues and the tradeoffs involved.",
  acknowledgement = ack-nhfb,
  keywords =     "hierarchy; relaxation process; interarrival; ASCI
                 queueing; dynamics",

  author =       "Mark F. Adams and Harun H. Bayraktar and Tony M.
                 Keaveny and Panayiotis Papadopoulos",
  title =        "Applications of Algebraic Multigrid to Large-Scale
                 Finite Element Analysis of Whole Bone Micro-Mechanics
                 on the {IBM SP}",
  crossref =     "ACM:2003:SII",
  pages =        "??--??",
  year =         "2003",
  bibdate =      "Wed Nov 26 07:34:20 2003",
  URL =          "http://www.sc-conference.org/sc2003/inter_cal/inter_cal_detail.php?eventid=10694#0;
  abstract =     "Accurate micro-finite element analyses of whole bones
                 require the solution of large sets of algebraic
                 equations. Multigrid has proven to be an effective
                 approach to the design of highly scalable linear
                 solvers for solid mechanics problems. We present some
                 of the first applications of scalable linear solvers,
                 on massively parallel computers, to whole vertebral
                 body structural analysis. We analyze the performance of
                 our algebraic multigrid (AMG) methods on problems with
                 over 237 million degrees of freedom on IBM SP parallel
                 computers. We demonstrate excellent parallel
                 scalability, both in the algorithms and the
                 implementations, and analyze the nodal performance of
                 the important AMG kernels on the IBM Power3 and Power4
  acknowledgement = ack-nhfb,
  keywords =     "multigrid; trabecular bone; human vertebral body;
                 finite element method; massively parallel computing.",

  author =       "Kai Wang and Jun Zhang and Chi Shen",
  title =        "Parallel Multilevel Sparse Approximate Inverse
                 Preconditioners in Large Sparse Matrix Computations",
  crossref =     "ACM:2003:SII",
  pages =        "??--??",
  year =         "2003",
  bibdate =      "Wed Nov 26 07:34:20 2003",
  URL =          "http://www.sc-conference.org/sc2003/inter_cal/inter_cal_detail.php?eventid=10694#1;
  abstract =     "We investigate the use of the multistep successive
                 preconditioning strategies (MSP) to construct a class
                 of parallel multilevel sparse approximate inverse (SAI)
                 preconditioners. We do not use independent set
                 ordering, but a diagonal dominance based matrix
                 permutation to build a multilevel structure. The
                 purpose of introducing multilevel structure into SAI is
                 to enhance the robustness of SAI for solving difficult
                 problems. Forward and backward preconditioning
                 iteration and two Schur complement preconditioning
                 strategies are proposed to improve the performance and
                 to reduce the storage cost of the multilevel
                 preconditioners. One version of the parallel multilevel
                 SAI preconditioner based on the MSP strategy is
                 implemented. Numerical experiments for solving a few
                 sparse matrices on a distributed memory parallel
                 computer are reported.",
  acknowledgement = ack-nhfb,

  author =       "Ji Qiang and Miguel A. Furman and Robert D. Ryne",
  title =        "Parallel Particle-In-Cell Simulation of Colliding
                 Beams in High Energy Accelerators",
  crossref =     "ACM:2003:SII",
  pages =        "??--??",
  year =         "2003",
  bibdate =      "Wed Nov 26 07:34:20 2003",
  URL =          "http://www.sc-conference.org/sc2003/inter_cal/inter_cal_detail.php?eventid=10694#2;
  abstract =     "In this paper we present a self-consistent simulation
                 model of colliding beams in high energy accelerators.
                 The model, which is based on a particle-in-cell method,
                 uses a new developed shifted-Green function algorithm
                 for the efficient calculation of the beam-beam
                 interaction. The model uses transfer maps to treat the
                 external focusing elements and a stochastic map to
                 treat radiation damping and quantum excitation of the
                 beams. In the parallel implementation we studied
                 various strategies to deal with the particular nature
                 of the colliding beam system --- a system in which
                 there can be significant particle movement between
                 beam-beam collisions. We chose a particle-field
                 decomposition approach instead of the conventional
                 domain decomposition or particle decomposition
                 approach. The particle-field approach leads to good
                 load balance, reduced communication cost, and shows the
                 best scalability on an IBM SP3 among the three parallel
                 implementations we studied. A performance test of the
                 beam-beam model on a Cray T3E, IBM SP3, and a PC
                 cluster is presented. As an application, we studied the
                 effect of long-range collisions on antiproton lifetime
                 in the Fermilab Tevatron.",
  acknowledgement = ack-nhfb,

  author =       "Peter Dinda and Dong Lu",
  title =        "Nondeterministic Queries in a Relational Grid
                 Information Service",
  crossref =     "ACM:2003:SII",
  pages =        "??--??",
  year =         "2003",
  bibdate =      "Wed Nov 26 07:34:20 2003",
  URL =          "http://www.sc-conference.org/sc2003/inter_cal/inter_cal_detail.php?eventid=10695#0;
  abstract =     "A Grid Information Service (GIS) stores information
                 about the resources of a distributed computing
                 environment and answers questions about it. We are
                 developing RGIS, a GIS system based on the relational
                 data model. RGIS users can write SQL queries that
                 search for complex compositions of resources that meet
                 collective requirements. Executing these queries can be
                 very expensive, however. In response, we introduce the
                 nondeterministic query, an extension to the SELECT
                 statement, which allows the user (and RGIS) to trade
                 off between the query's running time and the number of
                 results. The results are a random sample of the
                 deterministic results, which we argue is sufficient and
                 appropriate. Herein we describe RGIS, the
                 nondeterministic query extension, and its
                 implementation. Our evaluation shows that a meaningful
                 tradeoff between query time and results returned is
                 achievable, and that the tradeoff can be used to keep
                 query time largely independent of query complexity.",
  acknowledgement = ack-nhfb,

  author =       "Tahsin Kurc and Feng Lee and Gagan Agrawal and Umit
                 Catalyurek and Renato Ferreira and Joel Saltz",
  title =        "Optimizing Reduction Computations In a Distributed
  crossref =     "ACM:2003:SII",
  pages =        "??--??",
  year =         "2003",
  bibdate =      "Wed Nov 26 07:34:20 2003",
  URL =          "http://www.sc-conference.org/sc2003/inter_cal/inter_cal_detail.php?eventid=10695#1;
  abstract =     "We investigate runtime strategies for data-intensive
                 applications that involve generalized reductions on
                 large, distributed datasets. Our set of strategies
                 includes replicated filter state, partitioned filter
                 state, and hybrid options between these two extremes.
                 We evaluate these strategies using emulators of three
                 real applications, different query and output sizes,
                 and a number of configurations. We consider execution
                 in a homogeneous cluster and in a distributed
                 environment where only a subset of nodes host the data.
                 Our results show replicating the filter state scales
                 well and outperforms other schemes, if sufficient
                 memory is available and sufficient computation is
                 involved to offset the cost of global merge step. In
                 other cases, hybrid is usually the best. Moreover, in
                 almost all cases, the performance of the hybrid
                 strategy is quite close to the best strategy. Thus, we
                 believe that hybrid is an attractive approach when the
                 relative performance of different schemes cannot be
  acknowledgement = ack-nhfb,

  author =       "Hongzhang Shan and Leonid Oliker and Rupak Biswas",
  title =        "Job Superscheduler Architecture and Performance in
                 Computational Grid Environments",
  crossref =     "ACM:2003:SII",
  pages =        "??--??",
  year =         "2003",
  bibdate =      "Wed Nov 26 07:34:20 2003",
  URL =          "http://www.sc-conference.org/sc2003/inter_cal/inter_cal_detail.php?eventid=10695#2;
  abstract =     "Computational grids hold great promise in utilizing
                 geographically separated heterogeneous resources to
                 solve large-scale complex scientific problems. However,
                 a number of major technical hurdles, including
                 distributed resource management and effective job
                 scheduling, stand in the way of realizing these gains.
                 In this paper, we propose a novel grid superscheduler
                 architecture and three distributed job migration
                 algorithms. We also model the critical interaction
                 between the superscheduler and autonomous local
                 schedulers. Extensive performance comparisons with
                 ideal, central, and local schemes using real workloads
                 from leading computational centers are conducted in a
                 simulation environment. Additionally, synthetic
                 workloads are used to perform a detailed sensitivity
                 analysis of our superscheduler. Several key metrics
                 demonstrate that substantial performance gains can be
                 achieved via smart superscheduling in distributed
                 computational grids.",
  acknowledgement = ack-nhfb,

  author =       "Ranjesh G. Jaganathan and Keith D. Underwood and Ron
                 R. Sass",
  title =        "A Configurable Network Protocol for Cluster Based
                 Communications using Modular Hardware Primitives on an
                 Intelligent {NIC}",
  crossref =     "ACM:2003:SII",
  pages =        "??--??",
  year =         "2003",
  bibdate =      "Wed Nov 26 07:34:20 2003",
  URL =          "http://www.sc-conference.org/sc2003/inter_cal/inter_cal_detail.php?eventid=10702#0;
  abstract =     "The high overhead of generic protocols like TCP/IP
                 provides strong motivation for the development of a
                 better protocol architecture for cluster-based parallel
                 computers. Reconfigurable computing has a unique
                 opportunity to contribute hardware level protocol
                 acceleration while retaining the flexibility to adapt
                 to changing needs. Specifically, applications on a
                 cluster have various quality of service needs. In
                 addition, these applications typically run for a long
                 time relative to the reconfiguration time of an FPGA.
                 Thus, it is possible to provide application-specific
                 protocol processing to improve performance and reduce
                 space utilization. Reducing space utilization permits
                 the use of a greater portion of the FPGA for other
                 application-specific processing. This paper focuses on
                 work to create a set of parameterizable components that
                 can be put together as needed to obtain a customized
                 protocol for each application. To study the feasibility
                 of such an architecture, hardware components were built
                 that can be stitched together as needed to provide the
                 required functionality. Feasibility is demonstrated
                 using four different protocol configurations, namely:
                 (1) unreliable packet transfer; (2) reliable, unordered
                 message transfer without duplicate elimination; (3)
                 reliable, unordered message transfer with duplicate
                 elimination; and (4) reliable, ordered message transfer
                 with duplicate elimination. The different
                 configurations illustrate trade-offs between chip space
                 and functionality.",
  acknowledgement = ack-nhfb,

  author =       "Wu-chun Feng and Justin and Hurwitz and Harvey B.
                 Newman and Sylvain Ravot and Roger Les Cottrell and
                 Olivier Martin and Fabrizio Coccetti and Cheng Jin and
                 David Wei and Steven Low",
  title =        "Optimizing 10-Gigabit {Ethernet} in Networks of
                 Workstations, Clusters, and Grids: {A} Case Study",
  crossref =     "ACM:2003:SII",
  pages =        "??--??",
  year =         "2003",
  bibdate =      "Wed Nov 26 07:34:20 2003",
  URL =          "http://www.sc-conference.org/sc2003/inter_cal/inter_cal_detail.php?eventid=10702#1;
  abstract =     "This paper presents a case study of the 10-Gigabit
                 Ethernet (10GbE) adapter from Intel. Specifically,
                 with appropriate optimizations to the configurations of
                 the 10GbE adapter and TCP, we demonstrate that the
                 10GbE adapter can perform well in local-area,
                 storage-area, system-area, and wide-area networks. For
                 local-area, storage-area, and system-area networks in
                 support of networks of workstations, network-attached
                 storage, and clusters, respectively, we can achieve
                 over 7-Gb/s end-to-end throughput and 12$\mu$s
                 end-to-end latency between applications running on
                 Linux-based PCs. For the wide-area network in support
                 of grids, we broke the recently-set Internet2 Land
                 Speed Record by 2.5 times by sustaining an end-to-end
                 TCP/IP throughput of 2.38 Gb/s between Sunnyvale,
                 California and Geneva, Switzerland (i.e., 10,037
                 kilometers) to move over a terabyte of data in less
                 than an hour. Thus, the above results indicate that
                 10GbE may be a cost-effective solution across a
                 multitude of computing environments.",
  acknowledgement = ack-nhfb,

  author =       "Salvador Coll and Jose Duato and Fabrizio Petrini and
                 Francisco J. Mora",
  title =        "Scalable Hardware-Based Multicast Trees",
  crossref =     "ACM:2003:SII",
  pages =        "??--??",
  year =         "2003",
  bibdate =      "Wed Nov 26 07:34:20 2003",
  URL =          "http://www.sc-conference.org/sc2003/inter_cal/inter_cal_detail.php?eventid=10702#2;
  abstract =     "This paper presents an algorithm for implementing
                 optimal hardware-based multicast trees, on networks
                 that provide hardware support for collective
                 communication. Although the proposed methodology can be
                 generalized to a wide class of networks, we apply our
                 methodology to the Quadrics network, a state-of-the-art
                 network that provides hardware-based multicast
                 communication. The proposed mechanism is intended to
                 improve the performance of the collective communication
                 patterns on the network, in those cases where the
                 hardware support can not be directly used, for
                 instance, due to some faulty nodes. This scheme
                 provides significant reduction on multicast latencies
                 compared to the original system primitives, which use
                 multicast trees based on unicast communication. A
                 backtracking algorithm to find the optimal solution to
                 the problem is presented. In addition, a greedy
                 algorithm is presented and shown to provide near
                 optimal solutions. Finally, our experimental results
                 show the good performance and scalability of the
                 proposed multicast tree in comparison to the
                 traditional unicast-based multicast trees. Our
                 multicast mechanism doubles barrier synchronization and
                 broadcasts performance when compared to the
                 production-level MPI library.",
  acknowledgement = ack-nhfb,

  author =       "Gregory T. Balls and Scott B. Baden and Phillip
  title =        "{SCALLOP}: {A} Highly Scalable Parallel {Poisson}
                 Solver in Three Dimensions",
  crossref =     "ACM:2003:SII",
  pages =        "??--??",
  year =         "2003",
  bibdate =      "Wed Nov 26 07:34:20 2003",
  URL =          "http://www.sc-conference.org/sc2003/inter_cal/inter_cal_detail.php?eventid=10703#0;
  abstract =     "SCALLOP is a highly scalable solver and library for
                 elliptic partial differential equations on regular
                 block-structured domains. SCALLOP avoids high
                 communication overheads algorithmically by taking
                 advantage of the locality properties inherent to
                 solutions to elliptic PDEs. Communication costs are
                 small, on the order of a few percent of the total
                 running time on up to 1024 processors of NPACI's and
                 NERSC's IBM Power-3 SP systems. SCALLOP trades off
                 numerical overheads against communication. These
                 numerical overheads are independent of the number of
                 processors for a wide range of problem sizes. SCALLOP
                 is implicitly designed for infinite domain (free space)
                 boundary conditions, but the algorithm can be
                 reformulated to accommodate other boundary conditions.
                 The SCALLOP library is built on top of the KeLP
                 programming system and runs on a variety of
  acknowledgement = ack-nhfb,
  keywords =     "computation-intensive applications; parallel and
                 distributed algorithms; program optimization and
                 performance programming",

  author =       "Kengo Nakajima",
  title =        "Parallel Iterative Solvers of {GeoFEM} with Selective
                 Blocking Preconditioning for Nonlinear Contact Problems
                 on the {Earth Simulator}",
  crossref =     "ACM:2003:SII",
  pages =        "??--??",
  year =         "2003",
  bibdate =      "Wed Nov 26 07:34:20 2003",
  URL =          "http://www.sc-conference.org/sc2003/inter_cal/inter_cal_detail.php?eventid=10703#1;
  abstract =     "An efficient parallel iterative method with selective
                 blocking preconditioning has been developed for
                 symmetric multiprocessor (SMP) cluster architectures
                 with vector processors such as the Earth Simulator.
                 This method is based on a three-level hybrid parallel
                 programming model, which includes message passing for
                 inter-SMP node communication, loop directives by OpenMP
                 for intra-SMP node parallelization and vectorization
                 for each processing element (PE). This method provides
                 robust and smooth convergence and excellent vector and
                 parallel performance in 3D geophysical simulations with
                 contact conditions performed on the Earth Simulator.
                 The selective blocking preconditioning is much more
                 efficient than ILU(1) and ILU(2). Performance for the
                 complicated Southwest Japan model with more than 23 M
                 DOF on 10 SMP nodes (80 PEs) of the Earth Simulator was
                 161.7 GFLOPS, corresponding to 25.3\% of the peak
                 performance for hybrid programming model, and 190.4
                 GFLOPS (29.8\% of the peak performance) for flat MPI,
  acknowledgement = ack-nhfb,

  author =       "George Karypis",
  title =        "Multi-Constraint Mesh Partitioning for Contact\slash
                 Impact Computations",
  crossref =     "ACM:2003:SII",
  pages =        "??--??",
  year =         "2003",
  bibdate =      "Wed Nov 26 07:34:20 2003",
  URL =          "http://www.sc-conference.org/sc2003/inter_cal/inter_cal_detail.php?eventid=10703#2;
  abstract =     "We present a novel approach for decomposing
                 contact/impact computations in which the mesh elements
                 come in contact with each other during the course of
                 the simulation. Effective decomposition of these
                 computations poses a number of challenges as it needs
                 to both balance the computations and minimize the
                 amount of communication that is performed during the
                 finite element and the contact search phase. Our
                 approach achieves the first goal by partitioning the
                 underlying mesh such that it simultaneously balances
                 both the work that is performed during the finite
                 element phase and that performed during contact search
                 phase, while producing subdomains whose boundaries
                 consist of piecewise axes-parallel lines or planes. The
                 second goal is achieved by using a decision tree to
                 decompose the space into rectangular or box-shaped
                 regions that contain contact points from a single
                 partition. Our experimental evaluation on a sequence of
                 100 meshes, shows that this new approach can reduce the
                 overall communication overhead over existing
  acknowledgement = ack-nhfb,

  author =       "Volkan Akcelik and Jacobo Bielak and George Biros and
                 Ioannis Epanomeritakis and Antonio Fernandez and Omar
                 Ghattas and Eui Joong Kim and Julio Lopez and David
                 O'Hallaron and Tiankai Tu and John Urbanic",
  title =        "High Resolution Forward and Inverse Earthquake
                 Modeling on Terascale Computers",
  crossref =     "ACM:2003:SII",
  pages =        "??--??",
  year =         "2003",
  bibdate =      "Wed Nov 26 07:34:20 2003",
  URL =          "http://www.sc-conference.org/sc2003/inter_cal/inter_cal_detail.php?eventid=10707#0;
  abstract =     "For earthquake simulations to play an important role
                 in the reduction of seismic risk, they must be capable
                 of high resolution and high fidelity. We have developed
                 algorithms and tools for earthquake simulation based on
                 multiresolution hexahedral meshes. We have used this
                 capability to carry out 1 Hz simulations of the 1994
                 Northridge earthquake in the LA Basin using 100 million
                 grid points. Our wave propagation solver sustains 1.21
                 teraflop/s for 4 hours on 3000 AlphaServer processors
                 at 80\% parallel efficiency. Because of uncertainties
                 in characterizing earthquake source and basin material
                 properties, a critical remaining challenge is to invert
                 for source and material parameter fields for complex 3D
                 basins from records of past earthquakes. Towards this
                 end, we present results for material and source
                 inversion of high-resolution models of basins
                 undergoing antiplane motion using parallel scalable
                 inversion algorithms that overcome many of the
                 difficulties particular to inverse heterogeneous wave
                 propagation problems.",
  acknowledgement = ack-nhfb,

  author =       "Seung Jo Kim and Chang Sung Lee and Jeong Ho Kim and
                 Minsu Joh and Sangsan Lee",
  title =        "{IPSAP} : {A} High-performance Parallel Finite Element
                 Code for Large-scale Structural Analysis Based on
                 Domain-wise Multifrontal Technique",
  crossref =     "ACM:2003:SII",
  pages =        "??--??",
  year =         "2003",
  bibdate =      "Wed Nov 26 07:34:20 2003",
  URL =          "http://www.sc-conference.org/sc2003/inter_cal/inter_cal_detail.php?eventid=10707#1;
  abstract =     "Most of researches for large-scale parallel structural
                 analysis have focused on iterative solution methods
                 since direct solution methods generally have many
                 difficulties and disadvantages for large-scale
                 problems. However, due to the numerical robustness of
                 direct methods that guarantees the solution to be
                 obtained within estimated time, direct methods are much
                 more desirable for general application of large-scale
                 structural analysis, if the difficulties and
                 disadvantages can be overcome. In this research, we
                 propose the domain-wise multifrontal solver as an
                 efficient direct solver that can overcome most of these
                 difficulties and disadvantages. By using our own
                 structural analysis code IPSAP which uses the proposed
                 solver, we can solve the largest problem ever solved by
                 direct solvers and can sustain 191 Gflop/s with 256
                 CPUs on our self-made cluster system, Pegasus. By
                 implementing the block Lanczos algorithm using our
                 solver, IPSAP can solve eigenproblems with 7 millions
                 of DOFs within one hour.",
  acknowledgement = ack-nhfb,

  author =       "Lexing Ying and George Biros and Denis Zorin and
                 Harper Langston",
  title =        "A new parallel kernel-independent fast multipole
  crossref =     "ACM:2003:SII",
  pages =        "??--??",
  year =         "2003",
  bibdate =      "Wed Nov 26 07:34:20 2003",
  URL =          "http://www.sc-conference.org/sc2003/inter_cal/inter_cal_detail.php?eventid=10707#2;
  abstract =     "We present a new adaptive fast multipole algorithm and
                 its parallel implementation. The algorithm is
                 kernel-independent in the sense that the evaluation of
                 pairwise interactions does not rely on any analytic
                 expansions, but only utilizes kernel evaluations. The
                 new method provides the enabling technology for many
                 important problems in computational science and
                 engineering. Examples include viscous flows, fracture
                 mechanics and screened Coulombic interactions. Our
                 MPI-based parallel implementation logically separates
                 the computation and communication phases to avoid
                 synchronization in the upward and downward computation
                 passes, and thus allows us to fully exploit computation
                 and communication overlapping. We measure isogranular
                 and fixed-size scalability for a variety of kernels on
                 the Pittsburgh Supercomputing Center's TCS-1
                 AlphaServer on up to 3000 processors. We have solved
                 viscous flow problems with up to 2.1 billion unknowns
                 and we have achieved 1.6 Tflops/s peak performance and
                 1.13 Tflops/s sustained performance.",
  acknowledgement = ack-nhfb,
  keywords =     "Fast multipole methods; adaptive algorithms; massively
                 parallel computing; boundary integral equations; N-body
                 problems; viscous flows",

  author =       "Fabrizio Petrini and Darren J. Kerbyson and Scott
  title =        "The Case of the Missing Supercomputer Performance:
                 Achieving Optimal Performance on the 8,192 Processors
                 of {ASCI Q}",
  crossref =     "ACM:2003:SII",
  pages =        "??--??",
  year =         "2003",
  bibdate =      "Wed Nov 26 07:34:20 2003",
  URL =          "http://www.sc-conference.org/sc2003/inter_cal/inter_cal_detail.php?eventid=10706#0;
  abstract =     "In this paper we describe how we improved the
                 effective performance of ASCI Q, the world's
                 second-fastest supercomputer, to meet our expectations.
                 Using an arsenal of performance-analysis techniques
                 including analytical models, custom microbenchmarks,
                 full applications, and simulators, we succeeded in
                 observing a serious --- but previously undetected ---
                 performance problem. We identified the source of the
                 problem, eliminated the problem, and ``closed the
                 loop'' by demonstrating up to a factor of 2 improvement
                 in application performance. We present our methodology
                 and provide insight into performance analysis that is
                 immediately applicable to other large-scale
  acknowledgement = ack-nhfb,

  author =       "Thomas H. {Dunigan, Jr.} and Mark R. Fahey and James
                 B. White III and Patrick H. Worley",
  title =        "Early Evaluation of the {Cray X1}",
  crossref =     "ACM:2003:SII",
  pages =        "??--??",
  year =         "2003",
  bibdate =      "Wed Nov 26 07:34:20 2003",
  URL =          "http://www.sc-conference.org/sc2003/inter_cal/inter_cal_detail.php?eventid=10706#1;
  abstract =     "Oak Ridge National Laboratory installed a 32 processor
                 Cray X1 in March, 2003, and will have a 256 processor
                 system installed by October, 2003. In this paper we
                 describe our initial evaluation of the X1 architecture,
                 focusing on microbenchmarks, kernels, and application
                 codes that highlight the performance characteristics of
                 the X1 architecture and indicate how to use the system
                 most efficiently.",
  acknowledgement = ack-nhfb,

  author =       "Leonid Oliker and Andrew Canning and Jonathan Carter
                 and John Shalf and David Skinner and Stephane Ethier
                 and Rupak Biswas and Jahed Djomehri and Rob Van der
  title =        "Evaluation of Cache-based Superscalar and Cacheless
                 Vector Architectures for Scientific Computations",
  crossref =     "ACM:2003:SII",
  pages =        "??--??",
  year =         "2003",
  bibdate =      "Wed Nov 26 07:34:20 2003",
  URL =          "http://www.sc-conference.org/sc2003/inter_cal/inter_cal_detail.php?eventid=10706#2;
  abstract =     "The growing gap between sustained and peak performance
                 for scientific applications is a well-known problem in
                 high end computing. The recent development of parallel
                 vector systems offers the potential to bridge this gap
                 for many computational science codes and deliver a
                 substantial increase in computing capabilities. This
                 paper examines the intranode performance of the NEC
                 SX-6 vector processor and the cache-based IBM Power3/4
                 superscalar architectures across a number of scientific
                 computing areas. First, we present the performance of a
                 microbenchmark suite that examines low-level machine
                 characteristics. Next, we study the behavior of the NAS
                 Parallel Benchmarks. Finally, we evaluate the
                 performance of several scientific computing codes.
                 Results demonstrate that the SX-6 achieves high
                 performance on a large fraction of our applications and
                 often significantly outperforms the cache-based
                 architectures. However, certain applications are not
                 easily amenable to vectorization and would require
                 extensive algorithm and implementation reengineering to
                 utilize the SX-6 effectively.",
  acknowledgement = ack-nhfb,

  author =       "Yang-Suk Kee and Jin-Soo Kim and Soonhoi Ha",
  title =        "{ParADE}: An {OpenMP} Programming Environment for
                 {SMP} Cluster Systems",
  crossref =     "ACM:2003:SII",
  pages =        "??--??",
  year =         "2003",
  bibdate =      "Wed Nov 26 07:34:20 2003",
  URL =          "http://www.sc-conference.org/sc2003/inter_cal/inter_cal_detail.php?eventid=10708#0;
  abstract =     "Demand for programming environments to exploit
                 clusters of symmetric multiprocessors (SMPs) is
                 increasing. In this paper, we present a new programming
                 environment, called ParADE, to enable easy, portable,
                 and high-performance programming on SMP clusters. It is
                 an OpenMP programming environment on top of a
                 multi-threaded software distributed shared memory
                 (SDSM) system with a variant of home-based lazy release
                 consistency protocol. To boost performance, the runtime
                 system provides explicit message-passing primitives to
                 make it a hybrid-programming environment. Collective
                 communication primitives are used for the
                 synchronization and work-sharing directives associated
                 with small data structures, lessening the
                 synchronization overhead and avoiding the implicit
                 barriers of work-sharing directives. The OpenMP
                 translator bridges the gap between the OpenMP
                 abstraction and the hybrid programming interfaces of
                 the runtime system. The experiments with several NAS
                 benchmarks and applications on a Linux-based cluster
                 show promising results that ParADE overcomes the
                 performance problem of the conventional SDSM-based
                 OpenMP environment.",
  acknowledgement = ack-nhfb,
  keywords =     "programming environment; SMP cluster; software
                 distributed shared memory; hybrid programming; OpenMP;

  author =       "D. Brent Weatherly and David K. Lowenthal and Mario
                 Nakazawa and Franklin Lowenthal",
  title =        "{Dyn-MPI}: Supporting {MPI} on Non Dedicated
  crossref =     "ACM:2003:SII",
  pages =        "??--??",
  year =         "2003",
  bibdate =      "Wed Nov 26 07:34:20 2003",
  URL =          "http://www.sc-conference.org/sc2003/inter_cal/inter_cal_detail.php?eventid=10708#1;
  abstract =     "Distributing data is a fundamental problem in
                 implementing efficient distributed-memory parallel
                 programs. The problem becomes more difficult in
                 environments where the participating nodes are not
                 dedicated to a parallel application. We are
                 investigating the data distribution problem in non
                 dedicated environments in the context of explicit
                 message-passing programs.\par

                 To address this problem, we have designed and
                 implemented an extension to MPI called Dynamic MPI
                 (Dyn-MPI). The key component of Dyn-MPI is its run-time
                 system, which efficiently and automatically
                 redistributes data on the fly when there are changes in
                 the application or the underlying environment. Dyn-MPI
                 supports efficient memory allocation, precise
                 measurement of system load and computation time, and
                 node removal. Performance results show that programs
                 that use Dyn-MPI execute efficiently in non dedicated
                 environments, including up to almost a three-fold
                 improvement compared to programs that do not
                 redistribute data and a 25\% improvement over standard
                 adaptive load balancing techniques.",
  acknowledgement = ack-nhfb,

  author =       "Kevin J. Barker and Nikos P. Chrisochoides",
  title =        "An Evaluation of a Framework for the Dynamic Load
                 Balancing of Highly Adaptive and Irregular Parallel
  crossref =     "ACM:2003:SII",
  pages =        "??--??",
  year =         "2003",
  bibdate =      "Wed Nov 26 07:34:20 2003",
  URL =          "http://www.sc-conference.org/sc2003/inter_cal/inter_cal_detail.php?eventid=10708#2;
  abstract =     "We present an evaluation of a flexible framework and
                 runtime software system for the dynamic load balancing
                 of asynchronous and highly adaptive and irregular
                 applications. These applications, which include
                 parallel unstructured and adaptive mesh refinement,
                 serve as building blocks for a large class of
                 scientific applications. Extensive study has lead to
                 the development of solutions to the dynamic load
                 balancing problem for loosely synchronous and
                 computation intensive programs; however, these methods
                 are not suitable for asynchronous and highly adaptive
                 applications. We evaluate a new software framework
                 which includes support for an Active Messages style
                 communication mechanism, global name space, transparent
                 object migration, and preemptive decision making. Our
                 results from both a 3-dimensional parallel advancing
                 front mesh generation program, as well as a synthetic
                 microbenchmark, indicate that this new framework
                 out-performs two existing general-purpose, well-known,
                 and widely used software systems for the dynamic load
                 balancing of adaptive and irregular parallel
  acknowledgement = ack-nhfb,

  author =       "John W. Romein and Jaap Heringa and Henri E. Bal",
  title =        "A Million-Fold Speed Improvement in Genomic Repeats
  crossref =     "ACM:2003:SII",
  pages =        "??--??",
  year =         "2003",
  bibdate =      "Wed Nov 26 07:34:20 2003",
  URL =          "http://www.sc-conference.org/sc2003/inter_cal/inter_cal_detail.php?eventid=10710#0;
  abstract =     "This paper presents a novel, parallel algorithm for
                 generating top alignments. Top alignments are used for
                 finding internal repeats in biological sequences like
                 proteins and genes. Our algorithm replaces an older,
                 sequential algorithm (Repro), which was prohibitively
                 slow for sequence lengths higher than 2000. The new
                 algorithm is an order of magnitude faster ($O(n^3)$
                 rather than $O(n^4)$). The paper presents a three-level
                 parallel implementation of the algorithm: using SIMD
                 multimedia extensions found on present-day processors
                 (a novel technique that can be used to parallelize any
                 application that performs many sequence alignments),
                 using shared-memory parallelism, and using
                 distributed-memory parallelism. It allows processing
                 the longest known proteins (nearly 35000 amino acids).
                 We show exceptionally high speed improvements: between
                 548 and 889 on a cluster of 64 dual-processor machines,
                 compared to the new sequential algorithm. Especially
                 for long sequences, extreme speed improvements over the
                 old algorithm are obtained.",
  acknowledgement = ack-nhfb,

  author =       "Wahid Chrabakh and Rich Wolski",
  title =        "{GridSAT}: {A} Chaff-based Distributed {SAT} Solver
                 for the Grid",
  crossref =     "ACM:2003:SII",
  pages =        "??--??",
  year =         "2003",
  bibdate =      "Wed Nov 26 07:34:20 2003",
  URL =          "http://www.sc-conference.org/sc2003/inter_cal/inter_cal_detail.php?eventid=10710#1;
  abstract =     "We present GridSAT, a parallel and complete
                 satisfiability solver designed to solve non-trivial SAT
                 problem instances using a large number of widely
                 distributed and heterogeneous resources. The GridSAT
                 parallel algorithm uses intelligent backtracking,
                 distributed and carefully scheduled sharing of learned
                 clauses, and clause reduction. Our implementation
                 focuses on dynamic resource acquisition and release to
                 optimize application execution. We show how the large
                 number of computational resources that are available
                 from a Grid can be managed effectively for the
                 application by an automatic scheduler and effective
                 implementation. GridSAT execution speed is compared
                 against the best sequential solver as rated by the
                 SAT2002 competition using a wide variety of problem
                 instances. The results show that GridSAT delivers
                 speed-up for all but one of the test problem instances
                 that are of significant size. In addition, we describe
                 how GridSAT has solved previously unsolved
                 satisfiability problems and the domain science
                 contribution these results make.",
  acknowledgement = ack-nhfb,
  keywords =     "parallel; distributed; satisfiability; computational

  author =       "Werner Vogels",
  title =        "{HPC.NET} --- are {CLI}-based Virtual Machines
                 Suitable for High Performance Computing?",
  crossref =     "ACM:2003:SII",
  pages =        "??--??",
  year =         "2003",
  bibdate =      "Wed Nov 26 07:34:20 2003",
  URL =          "http://www.sc-conference.org/sc2003/inter_cal/inter_cal_detail.php?eventid=10710#2;
  abstract =     "The Common Language Infrastructure is a new,
                 standardized virtual machine that is likely to become
                 popular on several platforms. In this paper we review
                 whether this technology has any future in the
                 high-performance computing community, for example by
                 targeting the same application space as the Java-Grande
                 Forum. We review the technology by benchmarking three
                 implementations of the CLI and compare those with the
                 results on Java virtual machines.",
  acknowledgement = ack-nhfb,

  author =       "Junichiro Makino and Eiichiro Kokubo and Toshiyuki
                 Fukushige and Hiroshi Daisaka",
  title =        "Performance evaluation and tuning of {GRAPE-6} ---
                 towards 40 `real' {Tflops}",
  crossref =     "ACM:2003:SII",
  pages =        "??--??",
  year =         "2003",
  bibdate =      "Wed Nov 26 07:34:20 2003",
  URL =          "http://www.sc-conference.org/sc2003/inter_cal/inter_cal_detail.php?eventid=10711#0;
  abstract =     "In this paper, we describe the performance
                 characteristics of GRAPE-6, the sixth-generation
                 special-purpose computer for gravitational many-body
                 problems. GRAPE-6 consists of 2048 custom pipeline
                 chips, each of which integrates six pipeline processors
                 specialized for the calculation of gravitational
                 interaction between particles. The GRAPE hardware
                 performs the evaluation of the interaction. The
                 frontend processors perform all other operations, such
                 as the time integration of the orbits of particles,
                 I/O, on-the-fly analysis etc. The theoretical peak
                 speed of GRAPE-6 is 63.4 Tflops. We present the result
                 of benchmark runs, and discuss the performance
                 characteristics. We also present the measured
                 performance for a few real scientific applications. The
                 best performance so far achieved with real applications
                 is 35.3 Tflops.",
  acknowledgement = ack-nhfb,

  author =       "Dimitri Komatitsch and Seiji Tsuboi and Chen Ji and
                 Jeroen Tromp",
  title =        "A 14.6 billion degrees of freedom, 5 teraflops, 2.5
                 terabyte earthquake simulation on the {Earth
  crossref =     "ACM:2003:SII",
  pages =        "??--??",
  year =         "2003",
  bibdate =      "Wed Nov 26 07:34:20 2003",
  URL =          "http://www.sc-conference.org/sc2003/inter_cal/inter_cal_detail.php?eventid=10711#1;
  abstract =     "We use 1944 processors of the Earth Simulator to model
                 seismic wave propagation resulting from large
                 earthquakes. Simulations are conducted based upon the
                 spectral-element method, a high-degree finite-element
                 technique with an exactly diagonal mass matrix. We use
                 a very large mesh with 5.5 billion grid points (14.6
                 billion degrees of freedom). We include the full
                 complexity of the Earth, i.e., a three-dimensional
                 wave-speed and density structure, a 3-D crustal model,
                 ellipticity as well as topography and bathymetry. A
                 total of 2.5 terabytes of memory is needed. Our
                 implementation is purely based upon MPI, with loop
                 vectorization on each processor. We obtain an excellent
                 vectorization ratio of 99.3\%, and we reach a
                 performance of 5 teraflops (30\% of the peak
                 performance) on 38\% of the machine. The very high
                 resolution of the mesh allows us to perform fully
                 three-dimensional calculations at seismic periods as
                 low as 5 seconds.",
  acknowledgement = ack-nhfb,

  author =       "Michael S. Warren and Chris L. Fryer and M. Patrick
  title =        "The {Space Simulator}: Modeling the Universe from
                 Supernovae to Cosmology",
  crossref =     "ACM:2003:SII",
  pages =        "??--??",
  year =         "2003",
  bibdate =      "Wed Nov 26 07:34:20 2003",
  URL =          "http://www.sc-conference.org/sc2003/inter_cal/inter_cal_detail.php?eventid=10711#2;
  abstract =     "The Space Simulator is a 294-processor Beowulf cluster
                 with theoretical peak performance just below 1.5
                 Teraflop/s. It is based on the Shuttle XPC SS51G mini
                 chassis. Each node consists of a 2.53 GHz Pentium 4
                 processor, 1 Gb of 333 MHz DDR SDRAM, an 80 Gbyte
                 Maxtor hard drive, and a 3Com 3C996B-T Gigabit Ethernet
                 card. The network is made up of a Foundry FastIron 1500
                 and 800 Gigabit Ethernet switch. Each individual node
                 cost less than $1000, and the entire system cost under
                 $500,000. The cluster achieved Linpack performance of
                 665.1 Gflop/s on 288 processors in October 2002, making
                 it the 85th fastest computer in the world according to
                 the 20th TOP500 list. Performance has since improved to
                 757.1 Linpack Gflop/s, ranking at \#88 on the 21st
                 TOP500 list. This is the first machine in the TOP500 to
                 surpass Linpack price/performance of 1 dollar per
  acknowledgement = ack-nhfb,

  author =       "William J. Dally and Patrick Hanrahan and Mattan Erez
                 and Timothy J. Knight and Francois Labonte and Jung-Ho
                 Ahn and Nuwan Jayasena and Ujval J. Kapasi and Abhishek
                 Das and Jayanth Gummaraju and Ian Buck",
  title =        "{Merrimac}: Supercomputing with Streams",
  crossref =     "ACM:2003:SII",
  pages =        "??--??",
  year =         "2003",
  bibdate =      "Wed Nov 26 07:34:20 2003",
  URL =          "http://www.sc-conference.org/sc2003/inter_cal/inter_cal_detail.php?eventid=10717#0;
  abstract =     "Merrimac uses stream architecture and advanced
                 interconnection networks to give an order of magnitude
                 more performance per unit cost than cluster-based
                 scientific computers built from the same technology.
                 Organizing the computation into streams and exploiting
                 the resulting locality using a register hierarchy
                 enables a stream architecture to reduce the memory
                 bandwidth required by representative applications by an
                 order of magnitude or more. Hence a processing node
                 with a fixed bandwidth (expensive) can support an order
                 of magnitude more arithmetic units (inexpensive). This
                 in turn allows a given level of performance to be
                 achieved with fewer nodes (a 1-PFLOPS machine, for
                 example, with just 8,192 nodes) resulting in greater
                 reliability, and simpler system management. We sketch
                 the design of Merrimac, a streaming scientific computer
                 that can be scaled from a \$20K 2 TFLOPS workstation to
                 a \$20M 2 PFLOPS supercomputer and present the results
                 of some initial application experiments on this
  acknowledgement = ack-nhfb,

  author =       "Makoto Taiji and Tetsu Narumi and Yousuke Ohno and
                 Noriyuki Futatsugi and Atsushi Suenaga and Naoki Takada
                 and Akihiko Konagaya",
  title =        "{Protein Explorer}: {A} Petaflops Special-Purpose
                 Computer System for Molecular Dynamics Simulations",
  crossref =     "ACM:2003:SII",
  pages =        "??--??",
  year =         "2003",
  bibdate =      "Wed Nov 26 07:34:20 2003",
  URL =          "http://www.sc-conference.org/sc2003/inter_cal/inter_cal_detail.php?eventid=10717#1;
  abstract =     "We are developing the `Protein Explorer' system, a
                 petaflops special-purpose computer system for molecular
                 dynamics simulations. The Protein Explorer is a PC
                 cluster equipped with special-purpose engines that
                 calculate nonbonded interactions between atoms, which
                 is the most time-consuming part of the simulations. A
                 dedicated LSI `MDGRAPE-3 chip' performs these force
                 calculations at a speed of 165 gigaflops or higher. The
                 system will have 6,144 MDGRAPE-3 chips to achieve a
                 nominal peak performance of one petaflop. The system
                 will be completed in 2006. In this paper, we describe
                 the project plans and the architecture of the Protein
  acknowledgement = ack-nhfb,

  author =       "Wendell Anderson and Preston Briggs and C. Stephen
                 Hellberg and Daryl W. Hess and Alexei Khokhlov and
                 Marco Lanzagorta and Robert Rosenberg",
  title =        "Early Experience with Scientific Programs on the {Cray
  crossref =     "ACM:2003:SII",
  pages =        "??--??",
  year =         "2003",
  bibdate =      "Wed Nov 26 07:34:20 2003",
  URL =          "http://www.sc-conference.org/sc2003/inter_cal/inter_cal_detail.php?eventid=10717#2;
  abstract =     "We describe our experiences porting and tuning three
                 scientific programs to the Cray MTA-2, paying
                 particular attention to the problems posed by I/O. We
                 have measured the performance of each of the programs
                 over many different machine configurations and we
                 report on the scalability of each program. In addition,
                 we compare the performance of the MTA with that of an
                 SGI Origin running all three programs.",
  acknowledgement = ack-nhfb,

  author =       "Gurmeet Singh and Shishir Bharathi and Ann Chervenak
                 and Ewa Deelman and Carl Kesselman and Mary Manohar and
                 Sonal Patil and Laura Pearlman",
  title =        "A Metadata Catalog Service for Data Intensive
  crossref =     "ACM:2003:SII",
  pages =        "??--??",
  year =         "2003",
  bibdate =      "Wed Nov 26 07:34:20 2003",
  URL =          "http://www.sc-conference.org/sc2003/inter_cal/inter_cal_detail.php?eventid=10718#0;
  abstract =     "Advances in computational, storage and network
                 technologies as well as middle ware such as the Globus
                 Toolkit allow scientists to expand the sophistication
                 and scope of data -intensive applications . These
                 applications produce and analyze terabytes and
                 petabytes of data that are distributed in millions of
                 files or objects. To manage these large data sets
                 efficiently , metadata or descriptive information about
                 the data needs to be managed. There are various types
                 of metadata, and it is likely that a range of metadata
                 services will exist in Grid environments that are
                 specialized for particular types of metadata
                 cataloguing and discovery. In this paper, we present
                 the design of a Metadata Catalog Service (MCS) that
                 provides a mechanism for storing and accessing
                 descriptive metadata and allows users to query for data
                 items based on desired attributes. We describe our
                 experience in using the MCS with several applications
                 and present a scalability study of the service.",
  acknowledgement = ack-nhfb,

  author =       "Ewa Deelman and Raymond Plante and Carl Kesselman and
                 Gurmeet Singh and Mei Su and Gretchen Greene and Robert
                 Hanisch and Niall Gaffney and Antonio Volpicelli and
                 James Annis and Vijay Sekhri and Tamas Budavari and
                 Maria Nieto-Santisteban and William O'Mullane and David
                 Bohlender and Tom McGlynn and Arnold Rots and Olga
  title =        "Grid-Based Galaxy Morphology Analysis for the
                 {National Virtual Observatory}",
  crossref =     "ACM:2003:SII",
  pages =        "??--??",
  year =         "2003",
  bibdate =      "Wed Nov 26 07:34:20 2003",
  URL =          "http://www.sc-conference.org/sc2003/inter_cal/inter_cal_detail.php?eventid=10718#1;
  abstract =     "As part of the development of the National Virtual
                 Observatory (NVO), a Data Grid for astronomy, we have
                 developed a prototype science application to explore
                 the dynamical history of galaxy clusters by analyzing
                 the galaxies' morphologies. The purpose of the
                 prototype is to investigate how Grid-based technologies
                 can be used to provide specialized computational
                 services within the NVO environment. In this paper we
                 focus on the key enabling technology components,
                 particularly Chimera and Pegasus which are used to
                 create and manage the computational workflow that must
                 be present to deal with the challenging application
                 requirements. We illustrate how the components
                 interplay with each other and can be driven from a
                 special purpose application portal.",
  acknowledgement = ack-nhfb,

  author =       "Matthew S. Allen and Rich Wolski",
  title =        "The {Livny} and {Plank-Beck} Problems: Studies in Data
                 Movement on the Computational Grid",
  crossref =     "ACM:2003:SII",
  pages =        "??--??",
  year =         "2003",
  bibdate =      "Wed Nov 26 07:34:20 2003",
  URL =          "http://www.sc-conference.org/sc2003/inter_cal/inter_cal_detail.php?eventid=10718#2;
  abstract =     "Research on scheduling this data management has
                 focused on both the problem of distributing the storage
                 load among a set of servers and on replication as a way
                 of ensuring reliability and data proximity. In order to
                 store large data sets and keep their load balanced
                 across many hosts, many applications choose to divide
                 these sets into sections and distribute them. To access
                 these files reliably in spite of individual host
                 failures, these sections are frequently replicated
                 across many file servers. While the projects cited
                 above have each explored these problems in different
                 ways, commonalities among the various successful
                 solutions are beginning to emerge. In this paper, we
                 investigate two such commonalities, identified by noted
                 researchers in the field: Dr. Miron Livny [4] from the
                 University of Wisconsin, and Dr. James Plank [2] and
                 Dr. Micah Beck [3] from the University of Tennessee.",
  acknowledgement = ack-nhfb,

  author =       "Terry Jones and William Tuel and Larry Brenner and
                 Jeff Fier and Patrick Caffrey and Shawn Dawson and Rob
                 Neely and Robert Blackmore and Brian Maskell and Paul
                 Tomlinson and Mark Roberts",
  title =        "Improving the Scalability of Parallel Jobs by adding
                 Parallel Awareness to the Operating System",
  crossref =     "ACM:2003:SII",
  pages =        "??--??",
  year =         "2003",
  bibdate =      "Wed Nov 26 07:34:20 2003",
  URL =          "http://www.sc-conference.org/sc2003/inter_cal/inter_cal_detail.php?eventid=10716#0;
  abstract =     "A parallel application benefits from scheduling
                 policies that include a global perspective of the
                 application's process working set. As the interactions
                 among cooperating processes increase, mechanisms to
                 ameliorate waiting within one or more of the processes
                 become more important. In particular, collective
                 operations such as barriers and reductions are
                 extremely sensitive to even usually harmless events
                 such as context switches among members of the process
                 working set. For the last 18 months, we have been
                 researching the impact of random short-lived
                 interruptions such as timer-decrement processing and
                 periodic daemon activity, and developing strategies to
                 minimize their impact on large processor-count SPMD
                 bulk-synchronous programming styles. We present a novel
                 co-scheduling scheme for improving performance of
                 fine-grain collective activities such as barriers and
                 reductions, describe an implementation consisting of
                 operating system kernel modifications and run-time
                 system, and present a set of empirical results
                 comparing the technique with traditional operating
                 system scheduling. Our results indicate a speedup of
                 over 300\% on synchronizing collectives.",
  acknowledgement = ack-nhfb,

  author =       "Juan Fernandez and Eitan Frachtenberg and Fabrizio
  title =        "{BCS-MPI}: a New Approach in the System Software
                 Design for Large-Scale Parallel Computers",
  crossref =     "ACM:2003:SII",
  pages =        "??--??",
  year =         "2003",
  bibdate =      "Wed Nov 26 07:34:20 2003",
  URL =          "http://www.sc-conference.org/sc2003/inter_cal/inter_cal_detail.php?eventid=10716#1;
  abstract =     "Buffered CoScheduled MPI (BCS-MPI) introduces a new
                 approach to design the communication layer for
                 large-scale parallel machines. The emphasis of BCS-MPI
                 is on the global coordination of a large number of
                 communicating processes rather than on the traditional
                 optimization of the point-to-point performance. BCS-MPI
                 delays the interprocessor communication in order to
                 schedule globally the communication pattern and it is
                 designed on top of a minimal set of collective
                 communication primitives. In this paper we describe a
                 prototype implementation of BCS-MPI and its
                 communication protocols. Several experimental results,
                 executed on a set of scientific applications, show that
                 BCS-MPI can compete with a production-level MPI
                 implementation, but is much simpler to implement, debug
                 and model. Keywords: MPI, buffered coscheduling, STORM,
                 Quadrics, system software, communication protocols,
                 cluster computing, large-scale parallel computers.",
  acknowledgement = ack-nhfb,

  author =       "Adam Moody and Juan Fernandez and Fabrizio Petrini and
                 Dhabaleswar K. Panda",
  title =        "Scalable {NIC}-based Reduction on Large-Scale
  crossref =     "ACM:2003:SII",
  pages =        "??--??",
  year =         "2003",
  bibdate =      "Wed Nov 26 07:34:20 2003",
  URL =          "http://www.sc-conference.org/sc2003/inter_cal/inter_cal_detail.php?eventid=10716#2;
  abstract =     "Many parallel algorithms require efficient reduction
                 collectives. In response, researchers have designed
                 algorithms considering a range of parameters including
                 data size, system size, and communication
                 characteristics. Throughout this past work, however,
                 processing was limited to the host CPU. Today, modern
                 Network Interface Cards (NICs) sport programmable
                 processors with substantial memory, and thus introduce
                 a fresh variable into the equation. In this paper, we
                 investigate this new option in the context of
                 large-scale clusters. Through experiments on the
                 960-node, 1920-processor ASCI Linux Cluster (ALC) at
                 Lawrence Livermore National Laboratory, we show that
                 NIC-based reductions outperform host-based algorithms
                 in terms of reduced latency and increased consistency.
                 In particular, in the largest configuration tested ---
                 1812 processors --- our NIC-based algorithm summed
                 single-element vectors of 32-bit integers and 64-bit
                 floating-point numbers in 73 $\mu$s and 118 $\mu$s,
                 respectively. These results represent respective
                 improvements of 121\% and 39\% over the
                 production-level MPI library.",
  acknowledgement = ack-nhfb,

  author =       "Joachim Worringen and Jesper Larson Traff and Hubert
  title =        "Fast Parallel Non-Contiguous File Access",
  crossref =     "ACM:2003:SII",
  pages =        "??--??",
  year =         "2003",
  bibdate =      "Wed Nov 26 07:34:20 2003",
  URL =          "http://www.sc-conference.org/sc2003/inter_cal/inter_cal_detail.php?eventid=10722#0;
  abstract =     "Many applications of parallel I/O perform
                 non-contiguous file accesses: instead of accessing a
                 single (large) block of data in a file, a number of
                 (smaller) blocks of data scattered throughout the file
                 needs to be accessed in each logical I/O operation.
                 However, only few file system interfaces directly
                 support this kind of non-contiguous file access. In
                 contrast, the most commonly used parallel programming
                 interface, MPI, incorporates a flexible model of
                 parallel I/O through its MPI-IO interface. With MPI-IO,
                 arbitrary non-contiguous file accesses are supported in
                 a uniform fashion by the use of derived MPI datatypes
                 set up by the user to reflect the desired I/O

                 Despite a considerable amount of recent work in this
                 area, current MPI-IO implementations suffer from low
                 performance of such non-contiguous accesses when
                 compared to the performance of the storage system for
                 contiguous accesses. In this paper we analyze an
                 important bottleneck in the efficient handling of
                 non-contiguous access patterns in current
                 implementations of MPIIO. We present a new technique,
                 termed listless I/O, that can be incorporated into
                 MPI-IO implementations like the well-known ROMIO
                 implementation, and completely eliminates this
                 bottleneck. We have implemented the technique in
                 MPI/SX, the MPI implementation for the NEC SX-series of
                 parallel vector computers. Results with a synthetic
                 benchmark and an application kernel show that listless
                 I/O is able to increase the bandwidth for
                 non-contiguous file access by sometimes more than a
                 factor of 500 when compared to the traditional
  acknowledgement = ack-nhfb,

  author =       "Jianwei Li and Wei-keng Liao and Alok Choudhary and
                 Robert Ross and Rajeev Thakur and William Gropp and Rob
                 Latham and Andrew Siegel and Brad Gallagher and Michael
  title =        "{Parallel netCDF}: {A} High-Performance Scientific
                 {I/O} Interface",
  crossref =     "ACM:2003:SII",
  pages =        "??--??",
  year =         "2003",
  bibdate =      "Wed Nov 26 07:34:20 2003",
  URL =          "http://www.sc-conference.org/sc2003/inter_cal/inter_cal_detail.php?eventid=10722#1;
  abstract =     "Dataset storage, exchange, and access play a critical
                 role in scientific applications. For such purposes
                 netCDF serves as a portable, efficient file format and
                 programming interface, which is popular in numerous
                 scientific application domains. However, the original
                 interface does not provide an efficient mechanism for
                 parallel data storage and access. In this work, we
                 present a new parallel interface for writing and
                 reading netCDF datasets. This interface is derived with
                 minimal changes from the serial netCDF interface but
                 defines semantics for parallel access and is tailored
                 for high performance. The underlying parallel I/O is
                 achieved through MPI-IO, allowing for substantial
                 performance gains through the use of collective I/O
                 optimizations. We compare the implementation strategies
                 and performance with HDF5. Our tests indicate
                 programming convenience and significant I/O performance
                 improvement with this parallel netCDF (PnetCDF)
  acknowledgement = ack-nhfb,

  author =       "Scott Alan Klasky and Stephane Ethier and Zhihong Lin
                 and Kevin Martins and Doug McCune and Ravi Samtaney",
  title =        "Grid-Based Parallel Data Streaming implemented for the
                 Gyrokinetic Toroidal Code",
  crossref =     "ACM:2003:SII",
  pages =        "??--??",
  year =         "2003",
  bibdate =      "Wed Nov 26 07:34:20 2003",
  URL =          "http://www.sc-conference.org/sc2003/inter_cal/inter_cal_detail.php?eventid=10722#2;
  abstract =     "We have developed a threaded parallel data streaming
                 approach using Globus to transfer multi-terabyte
                 simulation data from a remote supercomputer to the
                 scientist's home analysis/visualization cluster, as the
                 simulation executes, with negligible overhead. Data
                 transfer experiments show that this concurrent data
                 transfer approach is more favorable compared with
                 writing to local disk and then transferring this data
                 to be post-processed. The present approach is conducive
                 to using the grid to pipeline the simulation with
                 post-processing and visualization. We have applied this
                 method to the Gyrokinetic Toroidal Code (GTC), a
                 3-dimensional particle-in-cell code used to study
                 micro-turbulence in magnetic confinement fusion from
                 first principles plasma theory.",
  acknowledgement = ack-nhfb,

  author =       "Robert W. Wisniewski and Bryan Rosenburg",
  title =        "Efficient, Unified, and Scalable Performance
                 Monitoring for Multiprocessor Operating Systems",
  crossref =     "ACM:2003:SII",
  pages =        "??--??",
  year =         "2003",
  bibdate =      "Wed Nov 26 07:34:20 2003",
  URL =          "http://www.sc-conference.org/sc2003/inter_cal/inter_cal_detail.php?eventid=10721#0;
  abstract =     "Programming, understanding, and tuning the performance
                 of large multiprocessor systems is challenging. Experts
                 have difficulty achieving good utilization for
                 applications on large machines. The task of
                 implementing a scalable system such as an operating
                 system or database on large machines is even more
                 challenging. And the importance of achieving good
                 performance on multiprocessor machines is increasing as
                 the number of cores per chip increases and as the size
                 of multiprocessors increases. Crucial to achieving good
                 performance is being able to understand the behavior of
                 the system.\par

                 We have developed an efficient, unified, and scalable
                 tracing infrastructure that allows for correctness
                 debugging, performance debugging, and performance
                 monitoring of an operating system. The infrastructure
                 allows variable-length events to be logged without
                 locking and provides random access to the event stream.
                 The infrastructure allows cheap and parallel logging of
                 events by applications, libraries, servers, and the
                 kernel. The infrastructure was designed for K42, a new
                 open-source research kernel designed to scale near
                 perfectly on large cache-coherent 64-bit multiprocessor
                 systems. The techniques are generally applicable, and
                 many of them have been integrated into the Linux Trace
                 Toolkit. In this paper, we describe the implementation
                 of the infrastructure, how we used the facility, e.g.,
                 analyzing lock contention, to understand and achieve
                 K42's scalable performance, and the lessons we learned.
                 The infrastructure has been invaluable to achieving
                 great scalability.",
  acknowledgement = ack-nhfb,

  author =       "Marty Itzkowitz and Brian J. N. Wylie and Christopher
                 Aoki and Nicolai Kosche",
  title =        "Memory Profiling using Hardware Counters",
  crossref =     "ACM:2003:SII",
  pages =        "??--??",
  year =         "2003",
  bibdate =      "Wed Nov 26 07:34:20 2003",
  URL =          "http://www.sc-conference.org/sc2003/inter_cal/inter_cal_detail.php?eventid=10721#1;
  abstract =     "Although memory performance is often a limiting factor
                 in application performance, most tools only show
                 performance data relating to the instructions in the
                 program, not to its data. In this paper, we describe a
                 technique for directly measuring the memory profile of
                 an application. We describe the tools and their user
                 model, and then discuss a particular code, the MCF
                 benchmark from SPEC CPU 2000. We show performance data
                 for the data structures and elements, and discuss the
                 use of the data to improve program performance.
                 Finally, we discuss extensions to the work to provide
                 feedback to the compiler for prefetching and to
                 generate additional reports from the data.",
  acknowledgement = ack-nhfb,

  author =       "Tushar Mohan and Bronis R. de Supinski and Sally A.
                 McKee and Frank Mueller and Andy Yoo and Martin
  title =        "Identifying and Exploiting Spatial Regularity in Data
                 Memory References",
  crossref =     "ACM:2003:SII",
  pages =        "??--??",
  year =         "2003",
  bibdate =      "Wed Nov 26 07:34:20 2003",
  URL =          "http://www.sc-conference.org/sc2003/inter_cal/inter_cal_detail.php?eventid=10721#2;
  abstract =     "The growing processor/memory performance gap causes
                 the performance of many codes to be limited by memory
                 accesses. If known to exist in an application, strided
                 memory accesses forming streams can be targeted by
                 optimizations such as prefetching, relocation,
                 remapping, and vector loads. Undetected, they can be a
                 significant source of memory stalls in loops. Existing
                 stream-detection mechanisms either require special
                 hardware, which may not gather statistics for
                 subsequent analysis, or are limited to compile-time
                 detection of array accesses in loops. Formally, little
                 treatment has been accorded to the subject; the concept
                 of locality fails to capture the existence of streams
                 in a program's memory accesses. The contributions of
                 this paper are as follows. First, we define spatial
                 regularity as a means to discuss the presence and
                 effects of streams. Second, we develop measures to
                 quantify spatial regularity, and we design and
                 implement an on-line, parallel algorithm to detect
                 streams -- and hence regularity -- in running
                 applications. Third, we use examples from real codes
                 and common benchmarks to illustrate how derived stream
                 statistics can be used to guide the application of
                 profile-driven optimizations. Overall, we demonstrate
                 the benefits of our novel regularity metric as an
                 instrument to detect potential for code optimizations
                 affecting memory performance.",
  acknowledgement = ack-nhfb,

%%% ====================================================================
%%% Cross-referenced entries must come last:

  editor =       "{ACM}",
  booktitle =    "SC2003: Igniting Innovation. {Phoenix, AZ, November
                 15--21, 2003}",
  title =        "{SC2003}: Igniting Innovation. {Phoenix, AZ, November
                 15--21, 2003}",
  publisher =    pub-ACM # " and " # pub-IEEE,
  address =      pub-ACM:adr # " and " # pub-IEEE:adr,
  pages =        "????",
  year =         "2003",
  ISBN =         "1-58113-695-1",
  ISBN-13 =      "978-1-58113-695-1",
  LCCN =         "????",
  bibdate =      "Thu Feb 21 18:29:36 2003",
  acknowledgement = ack-nhfb,