%%% -*-BibTeX-*-
%%% ====================================================================
%%% BibTeX-file{
%%%     author          = "Nelson H. F. Beebe",
%%%     version         = "1.47",
%%%     date            = "27 October 2014",
%%%     time            = "17:15:15 MDT",
%%%     filename        = "taco.bib",
%%%     address         = "University of Utah
%%%                        Department of Mathematics, 110 LCB
%%%                        155 S 1400 E RM 233
%%%                        Salt Lake City, UT 84112-0090
%%%                        USA",
%%%     telephone       = "+1 801 581 5254",
%%%     FAX             = "+1 801 581 4148",
%%%     URL             = "http://www.math.utah.edu/~beebe",
%%%     checksum        = "12546 15242 81841 789806",
%%%     email           = "beebe at math.utah.edu, beebe at acm.org,
%%%                        beebe at computer.org (Internet)",
%%%     codetable       = "ISO/ASCII",
%%%     keywords        = "ACM Transactions on Architecture and Code
%%%                        Optimization; bibliography; TACO",
%%%     license         = "public domain",
%%%     supported       = "yes",
%%%     docstring       = "This is a COMPLETE BibTeX bibliography for
%%%                        ACM Transactions on Architecture and Code
%%%                        Optimization (CODEN ????, ISSN 1544-3566),
%%%                        covering all journal issues from 2004 --
%%%                        date.
%%%
%%%                        At version 1.47, the COMPLETE journal
%%%                        coverage looked like this:
%%%
%%%                             2004 (  17)    2008 (  21)    2012 (  61)
%%%                             2005 (  17)    2009 (  20)    2013 ( 103)
%%%                             2006 (  19)    2010 (  21)    2014 (  28)
%%%                             2007 (  19)    2011 (  17)
%%%
%%%                             Article:        343
%%%
%%%                             Total entries:  343
%%%
%%%                        The journal Web page can be found at:
%%%
%%%                            http://www.acm.org/pubs/taco.html
%%%
%%%                        The journal table of contents page is at:
%%%
%%%                            http://www.acm.org/taco/
%%%                            http://portal.acm.org/browse_dl.cfm?idx=J924
%%%
%%%                        Qualified subscribers can retrieve the full
%%%                        text of recent articles in PDF form.
%%%
%%%                        The initial draft was extracted from the ACM
%%%                        Web pages.
%%%
%%%                        ACM copyrights explicitly permit abstracting
%%%                        with credit, so article abstracts, keywords,
%%%                        and subject classifications have been
%%%                        included in this bibliography wherever
%%%                        available.  Article reviews have been
%%%                        omitted, until their copyright status has
%%%                        been clarified.
%%%
%%%                        bibsource keys in the bibliography entries
%%%                        below indicate the entry originally came
%%%                        from the computer science bibliography
%%%                        archive, even though it has likely since
%%%                        been corrected and updated.
%%%
%%%                        URL keys in the bibliography point to
%%%                        World Wide Web locations of additional
%%%                        information about the entry.
%%%
%%%                        BibTeX citation tags are uniformly chosen
%%%                        as name:year:abbrev, where name is the
%%%                        family name of the first author or editor,
%%%                        year is a 4-digit number, and abbrev is a
%%%                        3-letter condensation of important title
%%%                        words. Citation tags were automatically
%%%                        generated by software developed for the
%%%                        BibNet Project.
%%%
%%%                        In this bibliography, entries are sorted in
%%%                        publication order, using ``bibsort -byvolume.''
%%%
%%%                        The checksum field above contains a CRC-16
%%%                        checksum as the first value, followed by the
%%%                        equivalent of the standard UNIX wc (word
%%%                        count) utility output of lines, words, and
%%%                        characters.  This is produced by Robert
%%%                        Solovay's checksum utility."
%%%     }
%%% ====================================================================

%%% Preamble text for the formatted bibliography: it reads in
%%% bibnames.sty and defines the \TM macro, which typesets a raised
%%% small-caps ``TM'' in math mode.  Plain-TeX syntax (\input, \def) is
%%% used here -- presumably so that the file also works outside LaTeX;
%%% confirm before converting to \providecommand.
@Preamble{"\input bibnames.sty" #
    "\def \TM {${}^{\sc TM}$}"
}

%%% ====================================================================
%%% Acknowledgement abbreviations:

%%% ack-nhfb: postal address, telephone/FAX numbers, e-mail addresses,
%%% and URL of Nelson H. F. Beebe.  The entries below reference this
%%% macro from their acknowledgement field.
@String{ack-nhfb = "Nelson H. F. Beebe,
                    University of Utah,
                    Department of Mathematics, 110 LCB,
                    155 S 1400 E RM 233,
                    Salt Lake City, UT 84112-0090, USA,
                    Tel: +1 801 581 5254,
                    FAX: +1 801 581 4148,
                    e-mail: \path|beebe@math.utah.edu|,
                            \path|beebe@acm.org|,
                            \path|beebe@computer.org| (Internet),
                    URL: \path|http://www.math.utah.edu/~beebe/|"}

%%% ====================================================================
%%% Journal abbreviations:

%%% j-TACO: journal title used by the journal field of the entries below.
@String{j-TACO = {ACM Transactions on Architecture and Code Optimization}}

%%% ====================================================================
%%% Bibliography entries:

@Article{Calder:2004:I,
  author          = {Brad Calder and
                     Dean Tullsen},
  title           = {Introduction},
  journal         = j-TACO,
  fjournal        = {ACM Transactions on Architecture and Code Optimization
                     (TACO)},
  volume          = {1},
  number          = {1},
  pages           = {1--2},
  month           = mar,
  year            = {2004},
  ISSN            = {1544-3566 (print), 1544-3973 (electronic)},
  CODEN           = {????},
  journal-URL     = {http://portal.acm.org/browse_dl.cfm?idx=J924},
  bibdate         = {Thu Aug 5 07:08:09 MDT 2004},
  bibsource       = {http://portal.acm.org/;
                     http://www.math.utah.edu/pub/tex/bib/taco.bib},
  acknowledgement = ack-nhfb,
}

@Article{Zhang:2004:RIC,
  author          = {W. Zhang and
                     J. S. Hu and
                     V. Degalahal and
                     M. Kandemir and
                     N. Vijaykrishnan and
                     M. J. Irwin},
  title           = {Reducing instruction cache energy consumption using a
                     compiler-based strategy},
  journal         = j-TACO,
  fjournal        = {ACM Transactions on Architecture and Code Optimization
                     (TACO)},
  volume          = {1},
  number          = {1},
  pages           = {3--33},
  month           = mar,
  year            = {2004},
  ISSN            = {1544-3566 (print), 1544-3973 (electronic)},
  CODEN           = {????},
  journal-URL     = {http://portal.acm.org/browse_dl.cfm?idx=J924},
  bibdate         = {Thu Aug 5 07:08:09 MDT 2004},
  bibsource       = {http://portal.acm.org/;
                     http://www.math.utah.edu/pub/tex/bib/taco.bib},
  acknowledgement = ack-nhfb,
}

@Article{Isailovic:2004:DCQ,
  author          = {Nemanja Isailovic and
                     Mark Whitney and
                     Yatish Patel and
                     John Kubiatowicz and
                     Dean Copsey and
                     Frederic T. Chong and
                     Isaac L. Chuang and
                     Mark Oskin},
  title           = {Datapath and control for quantum wires},
  journal         = j-TACO,
  fjournal        = {ACM Transactions on Architecture and Code Optimization
                     (TACO)},
  volume          = {1},
  number          = {1},
  pages           = {34--61},
  month           = mar,
  year            = {2004},
  ISSN            = {1544-3566 (print), 1544-3973 (electronic)},
  CODEN           = {????},
  journal-URL     = {http://portal.acm.org/browse_dl.cfm?idx=J924},
  bibdate         = {Thu Aug 5 07:08:09 MDT 2004},
  bibsource       = {http://portal.acm.org/;
                     http://www.math.utah.edu/pub/tex/bib/taco.bib},
  acknowledgement = ack-nhfb,
}

%%% The first word of the subtitle is capitalized and brace-protected,
%%% matching the other ``main title: {Subtitle} ...'' entries in this
%%% file, so that case-changing styles leave it alone.
@Article{Sankaralingam:2004:TPA,
  author =       "Karthikeyan Sankaralingam and Ramadass Nagarajan and
                 Haiming Liu and Changkyu Kim and Jaehyuk Huh and Nitya
                 Ranganathan and Doug Burger and Stephen W. Keckler and
                 Robert G. McDonald and Charles R. Moore",
  title =        "{TRIPS}: {A} polymorphous architecture for exploiting
                 {ILP}, {TLP}, and {DLP}",
  journal =      j-TACO,
  volume =       "1",
  number =       "1",
  pages =        "62--93",
  month =        mar,
  year =         "2004",
  CODEN =        "????",
  ISSN =         "1544-3566 (print), 1544-3973 (electronic)",
  bibdate =      "Thu Aug 5 07:08:09 MDT 2004",
  bibsource =    "http://portal.acm.org/;
                 http://www.math.utah.edu/pub/tex/bib/taco.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM Transactions on Architecture and Code Optimization
                 (TACO)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J924",
}

@Article{Skadron:2004:TAM,
  author          = {Kevin Skadron and
                     Mircea R. Stan and
                     Karthik Sankaranarayanan and
                     Wei Huang and
                     Sivakumar Velusamy and
                     David Tarjan},
  title           = {Temperature-aware microarchitecture: {Modeling} and
                     implementation},
  journal         = j-TACO,
  fjournal        = {ACM Transactions on Architecture and Code Optimization
                     (TACO)},
  volume          = {1},
  number          = {1},
  pages           = {94--125},
  month           = mar,
  year            = {2004},
  ISSN            = {1544-3566 (print), 1544-3973 (electronic)},
  CODEN           = {????},
  journal-URL     = {http://portal.acm.org/browse_dl.cfm?idx=J924},
  bibdate         = {Thu Aug 5 07:08:09 MDT 2004},
  bibsource       = {http://portal.acm.org/;
                     http://www.math.utah.edu/pub/tex/bib/taco.bib},
  acknowledgement = ack-nhfb,
}

@Article{Aleta:2004:RCC,
  author          = {Alex Alet{\`a} and
                     Josep M. Codina and
                     Antonio Gonz{\'a}lez and
                     David Kaeli},
  title           = {Removing communications in clustered microarchitectures
                     through instruction replication},
  journal         = j-TACO,
  fjournal        = {ACM Transactions on Architecture and Code Optimization
                     (TACO)},
  volume          = {1},
  number          = {2},
  pages           = {127--151},
  month           = jun,
  year            = {2004},
  ISSN            = {1544-3566 (print), 1544-3973 (electronic)},
  CODEN           = {????},
  journal-URL     = {http://portal.acm.org/browse_dl.cfm?idx=J924},
  bibdate         = {Thu Aug 5 07:08:10 MDT 2004},
  bibsource       = {http://portal.acm.org/;
                     http://www.math.utah.edu/pub/tex/bib/taco.bib},
  acknowledgement = ack-nhfb,
}

@Article{Bai:2004:LPO,
  author          = {Yu Bai and
                     R. Iris Bahar},
  title           = {A low-power in-order\slash out-of-order issue queue},
  journal         = j-TACO,
  fjournal        = {ACM Transactions on Architecture and Code Optimization
                     (TACO)},
  volume          = {1},
  number          = {2},
  pages           = {152--179},
  month           = jun,
  year            = {2004},
  ISSN            = {1544-3566 (print), 1544-3973 (electronic)},
  CODEN           = {????},
  journal-URL     = {http://portal.acm.org/browse_dl.cfm?idx=J924},
  bibdate         = {Thu Aug 5 07:08:10 MDT 2004},
  bibsource       = {http://portal.acm.org/;
                     http://www.math.utah.edu/pub/tex/bib/taco.bib},
  acknowledgement = ack-nhfb,
}

@Article{Juang:2004:IBP,
  author          = {Philo Juang and
                     Kevin Skadron and
                     Margaret Martonosi and
                     Zhigang Hu and
                     Douglas W. Clark and
                     Philip W. Diodato and
                     Stefanos Kaxiras},
  title           = {Implementing branch-predictor decay using quasi-static
                     memory cells},
  journal         = j-TACO,
  fjournal        = {ACM Transactions on Architecture and Code Optimization
                     (TACO)},
  volume          = {1},
  number          = {2},
  pages           = {180--219},
  month           = jun,
  year            = {2004},
  ISSN            = {1544-3566 (print), 1544-3973 (electronic)},
  CODEN           = {????},
  journal-URL     = {http://portal.acm.org/browse_dl.cfm?idx=J924},
  bibdate         = {Thu Aug 5 07:08:10 MDT 2004},
  bibsource       = {http://portal.acm.org/;
                     http://www.math.utah.edu/pub/tex/bib/taco.bib},
  acknowledgement = ack-nhfb,
}

@Article{Santana:2004:LCF,
  author          = {Oliverio J. Santana and
                     Alex Ramirez and
                     Josep L. Larriba-Pey and
                     Mateo Valero},
  title           = {A low-complexity fetch architecture for high-performance
                     superscalar processors},
  journal         = j-TACO,
  fjournal        = {ACM Transactions on Architecture and Code Optimization
                     (TACO)},
  volume          = {1},
  number          = {2},
  pages           = {220--245},
  month           = jun,
  year            = {2004},
  ISSN            = {1544-3566 (print), 1544-3973 (electronic)},
  CODEN           = {????},
  journal-URL     = {http://portal.acm.org/browse_dl.cfm?idx=J924},
  bibdate         = {Thu Aug 5 07:08:10 MDT 2004},
  bibsource       = {http://portal.acm.org/;
                     http://www.math.utah.edu/pub/tex/bib/taco.bib},
  acknowledgement = ack-nhfb,
}

@Article{Lin:2004:CFS,
  author          = {Jin Lin and
                     Tong Chen and
                     Wei-Chung Hsu and
                     Pen-Chung Yew and
                     Roy Dz-Ching Ju and
                     Tin-Fook Ngai and
                     Sun Chan},
  title           = {A compiler framework for speculative optimizations},
  journal         = j-TACO,
  fjournal        = {ACM Transactions on Architecture and Code Optimization
                     (TACO)},
  volume          = {1},
  number          = {3},
  pages           = {247--271},
  month           = sep,
  year            = {2004},
  ISSN            = {1544-3566 (print), 1544-3973 (electronic)},
  CODEN           = {????},
  journal-URL     = {http://portal.acm.org/browse_dl.cfm?idx=J924},
  bibdate         = {Fri Oct 29 06:39:45 MDT 2004},
  bibsource       = {http://portal.acm.org/;
                     http://www.math.utah.edu/pub/tex/bib/taco.bib},
  acknowledgement = ack-nhfb,
}

@Article{Fields:2004:ICS,
  author          = {Brian A. Fields and
                     Rastislav Bodik and
                     Mark D. Hill and
                     Chris J. Newburn},
  title           = {Interaction cost and shotgun profiling},
  journal         = j-TACO,
  fjournal        = {ACM Transactions on Architecture and Code Optimization
                     (TACO)},
  volume          = {1},
  number          = {3},
  pages           = {272--304},
  month           = sep,
  year            = {2004},
  ISSN            = {1544-3566 (print), 1544-3973 (electronic)},
  CODEN           = {????},
  journal-URL     = {http://portal.acm.org/browse_dl.cfm?idx=J924},
  bibdate         = {Fri Oct 29 06:39:45 MDT 2004},
  bibsource       = {http://portal.acm.org/;
                     http://www.math.utah.edu/pub/tex/bib/taco.bib},
  acknowledgement = ack-nhfb,
}

@Article{Sankaranarayanan:2004:PBA,
  author          = {Karthik Sankaranarayanan and
                     Kevin Skadron},
  title           = {Profile-based adaptation for cache decay},
  journal         = j-TACO,
  fjournal        = {ACM Transactions on Architecture and Code Optimization
                     (TACO)},
  volume          = {1},
  number          = {3},
  pages           = {305--322},
  month           = sep,
  year            = {2004},
  ISSN            = {1544-3566 (print), 1544-3973 (electronic)},
  CODEN           = {????},
  journal-URL     = {http://portal.acm.org/browse_dl.cfm?idx=J924},
  bibdate         = {Fri Oct 29 06:39:45 MDT 2004},
  bibsource       = {http://portal.acm.org/;
                     http://www.math.utah.edu/pub/tex/bib/taco.bib},
  acknowledgement = ack-nhfb,
}

@Article{Xie:2004:IDV,
  author          = {Fen Xie and
                     Margaret Martonosi and
                     Sharad Malik},
  title           = {Intraprogram dynamic voltage scaling: {Bounding}
                     opportunities with analytic modeling},
  journal         = j-TACO,
  fjournal        = {ACM Transactions on Architecture and Code Optimization
                     (TACO)},
  volume          = {1},
  number          = {3},
  pages           = {323--367},
  month           = sep,
  year            = {2004},
  ISSN            = {1544-3566 (print), 1544-3973 (electronic)},
  CODEN           = {????},
  journal-URL     = {http://portal.acm.org/browse_dl.cfm?idx=J924},
  bibdate         = {Fri Oct 29 06:39:45 MDT 2004},
  bibsource       = {http://portal.acm.org/;
                     http://www.math.utah.edu/pub/tex/bib/taco.bib},
  acknowledgement = ack-nhfb,
}

@Article{Hartstein:2004:OPD,
  author          = {A. Hartstein and
                     Thomas R. Puzak},
  title           = {The optimum pipeline depth considering both power and
                     performance},
  journal         = j-TACO,
  fjournal        = {ACM Transactions on Architecture and Code Optimization
                     (TACO)},
  volume          = {1},
  number          = {4},
  pages           = {369--388},
  month           = dec,
  year            = {2004},
  ISSN            = {1544-3566 (print), 1544-3973 (electronic)},
  CODEN           = {????},
  journal-URL     = {http://portal.acm.org/browse_dl.cfm?idx=J924},
  bibdate         = {Thu Apr 14 12:17:47 MDT 2005},
  bibsource       = {http://portal.acm.org/;
                     http://www.math.utah.edu/pub/tex/bib/taco.bib},
  acknowledgement = ack-nhfb,
}

@Article{Cristal:2004:TKI,
  author          = {Adri{\'a}n Cristal and
                     Oliverio J. Santana and
                     Mateo Valero and
                     Jos{\'e} F. Mart{\'\i}nez},
  title           = {Toward kilo-instruction processors},
  journal         = j-TACO,
  fjournal        = {ACM Transactions on Architecture and Code Optimization
                     (TACO)},
  volume          = {1},
  number          = {4},
  pages           = {389--417},
  month           = dec,
  year            = {2004},
  ISSN            = {1544-3566 (print), 1544-3973 (electronic)},
  CODEN           = {????},
  journal-URL     = {http://portal.acm.org/browse_dl.cfm?idx=J924},
  bibdate         = {Thu Apr 14 12:17:47 MDT 2005},
  bibsource       = {http://portal.acm.org/;
                     http://www.math.utah.edu/pub/tex/bib/taco.bib},
  acknowledgement = ack-nhfb,
}

@Article{Akkary:2004:ARE,
  author          = {Haitham Akkary and
                     Ravi Rajwar and
                     Srikanth T. Srinivasan},
  title           = {An analysis of a resource efficient checkpoint
                     architecture},
  journal         = j-TACO,
  fjournal        = {ACM Transactions on Architecture and Code Optimization
                     (TACO)},
  volume          = {1},
  number          = {4},
  pages           = {418--444},
  month           = dec,
  year            = {2004},
  ISSN            = {1544-3566 (print), 1544-3973 (electronic)},
  CODEN           = {????},
  journal-URL     = {http://portal.acm.org/browse_dl.cfm?idx=J924},
  bibdate         = {Thu Apr 14 12:17:47 MDT 2005},
  bibsource       = {http://portal.acm.org/;
                     http://www.math.utah.edu/pub/tex/bib/taco.bib},
  acknowledgement = ack-nhfb,
}

@Article{Yang:2004:TML,
  author          = {Chia-Lin Yang and
                     Alvin R. Lebeck and
                     Hung-Wei Tseng and
                     Chien-Hao Lee},
  title           = {Tolerating memory latency through push prefetching for
                     pointer-intensive applications},
  journal         = j-TACO,
  fjournal        = {ACM Transactions on Architecture and Code Optimization
                     (TACO)},
  volume          = {1},
  number          = {4},
  pages           = {445--475},
  month           = dec,
  year            = {2004},
  ISSN            = {1544-3566 (print), 1544-3973 (electronic)},
  CODEN           = {????},
  journal-URL     = {http://portal.acm.org/browse_dl.cfm?idx=J924},
  bibdate         = {Thu Apr 14 12:17:47 MDT 2005},
  bibsource       = {http://portal.acm.org/;
                     http://www.math.utah.edu/pub/tex/bib/taco.bib},
  acknowledgement = ack-nhfb,
}

@Article{Calder:2005:I,
  author          = {Brad Calder and
                     Dean Tullsen},
  title           = {Introduction},
  journal         = j-TACO,
  fjournal        = {ACM Transactions on Architecture and Code Optimization
                     (TACO)},
  volume          = {2},
  number          = {1},
  pages           = {1--2},
  month           = mar,
  year            = {2005},
  ISSN            = {1544-3566 (print), 1544-3973 (electronic)},
  CODEN           = {????},
  journal-URL     = {http://portal.acm.org/browse_dl.cfm?idx=J924},
  bibdate         = {Mon May 2 11:13:58 MDT 2005},
  bibsource       = {http://portal.acm.org/;
                     http://www.math.utah.edu/pub/tex/bib/taco.bib},
  acknowledgement = ack-nhfb,
}

@Article{Zhou:2005:EFA,
  author          = {Yuanyuan Zhou and
                     Pin Zhou and
                     Feng Qin and
                     Wei Liu and
                     Josep Torrellas},
  title           = {Efficient and flexible architectural support for dynamic
                     monitoring},
  journal         = j-TACO,
  fjournal        = {ACM Transactions on Architecture and Code Optimization
                     (TACO)},
  volume          = {2},
  number          = {1},
  pages           = {3--33},
  month           = mar,
  year            = {2005},
  ISSN            = {1544-3566 (print), 1544-3973 (electronic)},
  CODEN           = {????},
  journal-URL     = {http://portal.acm.org/browse_dl.cfm?idx=J924},
  bibdate         = {Mon May 2 11:13:58 MDT 2005},
  bibsource       = {http://portal.acm.org/;
                     http://www.math.utah.edu/pub/tex/bib/taco.bib},
  acknowledgement = ack-nhfb,
}

@Article{Zhang:2005:WHC,
  author          = {Chuanjun Zhang and
                     Frank Vahid and
                     Jun Yang and
                     Walid Najjar},
  title           = {A way-halting cache for low-energy high-performance
                     systems},
  journal         = j-TACO,
  fjournal        = {ACM Transactions on Architecture and Code Optimization
                     (TACO)},
  volume          = {2},
  number          = {1},
  pages           = {34--54},
  month           = mar,
  year            = {2005},
  ISSN            = {1544-3566 (print), 1544-3973 (electronic)},
  CODEN           = {????},
  journal-URL     = {http://portal.acm.org/browse_dl.cfm?idx=J924},
  bibdate         = {Mon May 2 11:13:58 MDT 2005},
  bibsource       = {http://portal.acm.org/;
                     http://www.math.utah.edu/pub/tex/bib/taco.bib},
  acknowledgement = ack-nhfb,
}

%%% The first word of the subtitle is capitalized and brace-protected,
%%% matching the other ``main title: {Subtitle} ...'' entries in this
%%% file, so that case-changing styles leave it alone.
@Article{Abella:2005:ISP,
  author =       "Jaume Abella and Antonio Gonz{\'a}lez and Xavier Vera
                 and Michael F. P. O'Boyle",
  title =        "{IATAC}: {A} smart predictor to turn-off {L2} cache
                 lines",
  journal =      j-TACO,
  volume =       "2",
  number =       "1",
  pages =        "55--77",
  month =        mar,
  year =         "2005",
  CODEN =        "????",
  ISSN =         "1544-3566 (print), 1544-3973 (electronic)",
  bibdate =      "Mon May 2 11:13:58 MDT 2005",
  bibsource =    "http://portal.acm.org/;
                 http://www.math.utah.edu/pub/tex/bib/taco.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM Transactions on Architecture and Code Optimization
                 (TACO)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J924",
}

%%% The first author is given in BibTeX's ``Last, Jr., First'' form so
%%% that the suffix is parsed as the Jr part of the name instead of
%%% being fused into the surname by a brace group.
@Article{Haskins:2005:AWS,
  author =       "Haskins, Jr., John W. and Kevin Skadron",
  title =        "Accelerated warmup for sampled microarchitecture
                 simulation",
  journal =      j-TACO,
  volume =       "2",
  number =       "1",
  pages =        "78--108",
  month =        mar,
  year =         "2005",
  CODEN =        "????",
  ISSN =         "1544-3566 (print), 1544-3973 (electronic)",
  bibdate =      "Mon May 2 11:13:58 MDT 2005",
  bibsource =    "http://portal.acm.org/;
                 http://www.math.utah.edu/pub/tex/bib/taco.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM Transactions on Architecture and Code Optimization
                 (TACO)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J924",
}

@Article{Li:2005:ABT,
  author          = {Tao Li and
                     Ravi Bhargava and
                     Lizy Kurian John},
  title           = {Adapting branch-target buffer to improve the target
                     predictability of {Java} code},
  journal         = j-TACO,
  fjournal        = {ACM Transactions on Architecture and Code Optimization
                     (TACO)},
  volume          = {2},
  number          = {2},
  pages           = {109--130},
  month           = jun,
  year            = {2005},
  ISSN            = {1544-3566 (print), 1544-3973 (electronic)},
  CODEN           = {????},
  journal-URL     = {http://portal.acm.org/browse_dl.cfm?idx=J924},
  bibdate         = {Thu Jul 7 14:09:53 MDT 2005},
  bibsource       = {http://portal.acm.org/;
                     http://www.math.utah.edu/pub/tex/bib/taco.bib},
  acknowledgement = ack-nhfb,
}

@Article{Zhang:2005:DIE,
  author          = {Lingli Zhang and
                     Chandra Krintz},
  title           = {The design, implementation, and evaluation of adaptive
                     code unloading for resource-constrained devices},
  journal         = j-TACO,
  fjournal        = {ACM Transactions on Architecture and Code Optimization
                     (TACO)},
  volume          = {2},
  number          = {2},
  pages           = {131--164},
  month           = jun,
  year            = {2005},
  ISSN            = {1544-3566 (print), 1544-3973 (electronic)},
  CODEN           = {????},
  journal-URL     = {http://portal.acm.org/browse_dl.cfm?idx=J924},
  bibdate         = {Thu Jul 7 14:09:53 MDT 2005},
  bibsource       = {http://portal.acm.org/;
                     http://www.math.utah.edu/pub/tex/bib/taco.bib},
  acknowledgement = ack-nhfb,
}

@Article{Kulkarni:2005:FES,
  author          = {Prasad A. Kulkarni and
                     Stephen R. Hines and
                     David B. Whalley and
                     Jason D. Hiser and
                     Jack W. Davidson and
                     Douglas L. Jones},
  title           = {Fast and efficient searches for effective
                     optimization-phase sequences},
  journal         = j-TACO,
  fjournal        = {ACM Transactions on Architecture and Code Optimization
                     (TACO)},
  volume          = {2},
  number          = {2},
  pages           = {165--198},
  month           = jun,
  year            = {2005},
  ISSN            = {1544-3566 (print), 1544-3973 (electronic)},
  CODEN           = {????},
  journal-URL     = {http://portal.acm.org/browse_dl.cfm?idx=J924},
  bibdate         = {Thu Jul 7 14:09:53 MDT 2005},
  bibsource       = {http://portal.acm.org/;
                     http://www.math.utah.edu/pub/tex/bib/taco.bib},
  acknowledgement = ack-nhfb,
}

%%% ``vs.'' is an abbreviation in mid-sentence, so it is followed by a
%%% control space (\ ) to stop TeX from inserting end-of-sentence
%%% spacing after the period.
@Article{Salami:2005:DMI,
  author =       "Esther Salam{\'\i} and Mateo Valero",
  title =        "Dynamic memory interval test vs.\ interprocedural
                 pointer analysis in multimedia applications",
  journal =      j-TACO,
  volume =       "2",
  number =       "2",
  pages =        "199--219",
  month =        jun,
  year =         "2005",
  CODEN =        "????",
  ISSN =         "1544-3566 (print), 1544-3973 (electronic)",
  bibdate =      "Thu Jul 7 14:09:53 MDT 2005",
  bibsource =    "http://portal.acm.org/;
                 http://www.math.utah.edu/pub/tex/bib/taco.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM Transactions on Architecture and Code Optimization
                 (TACO)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J924",
}

@Article{Meng:2005:ELL,
  author          = {Yan Meng and
                     Timothy Sherwood and
                     Ryan Kastner},
  title           = {Exploring the limits of leakage power reduction in
                     caches},
  journal         = j-TACO,
  fjournal        = {ACM Transactions on Architecture and Code Optimization
                     (TACO)},
  volume          = {2},
  number          = {3},
  pages           = {221--246},
  month           = sep,
  year            = {2005},
  ISSN            = {1544-3566 (print), 1544-3973 (electronic)},
  CODEN           = {????},
  journal-URL     = {http://portal.acm.org/browse_dl.cfm?idx=J924},
  bibdate         = {Wed Oct 5 07:42:22 MDT 2005},
  bibsource       = {http://portal.acm.org/;
                     http://www.math.utah.edu/pub/tex/bib/taco.bib},
  acknowledgement = ack-nhfb,
}

@Article{Garzaran:2005:TBS,
  author          = {Mar{\'\i}a Jes{\'u}s Garzar{\'a}n and
                     Milos Prvulovic and
                     Jos{\'e} Mar{\'\i}a Llaber{\'\i}a and
                     V{\'\i}ctor Vi{\~n}als and
                     Lawrence Rauchwerger and
                     Josep Torrellas},
  title           = {Tradeoffs in buffering speculative memory state for
                     thread-level speculation in multiprocessors},
  journal         = j-TACO,
  fjournal        = {ACM Transactions on Architecture and Code Optimization
                     (TACO)},
  volume          = {2},
  number          = {3},
  pages           = {247--279},
  month           = sep,
  year            = {2005},
  ISSN            = {1544-3566 (print), 1544-3973 (electronic)},
  CODEN           = {????},
  journal-URL     = {http://portal.acm.org/browse_dl.cfm?idx=J924},
  bibdate         = {Wed Oct 5 07:42:22 MDT 2005},
  bibsource       = {http://portal.acm.org/;
                     http://www.math.utah.edu/pub/tex/bib/taco.bib},
  acknowledgement = ack-nhfb,
}

@Article{Tarjan:2005:MPG,
  author          = {David Tarjan and
                     Kevin Skadron},
  title           = {Merging path and gshare indexing in perceptron branch
                     prediction},
  journal         = j-TACO,
  fjournal        = {ACM Transactions on Architecture and Code Optimization
                     (TACO)},
  volume          = {2},
  number          = {3},
  pages           = {280--300},
  month           = sep,
  year            = {2005},
  ISSN            = {1544-3566 (print), 1544-3973 (electronic)},
  CODEN           = {????},
  journal-URL     = {http://portal.acm.org/browse_dl.cfm?idx=J924},
  bibdate         = {Wed Oct 5 07:42:22 MDT 2005},
  bibsource       = {http://portal.acm.org/;
                     http://www.math.utah.edu/pub/tex/bib/taco.bib},
  acknowledgement = ack-nhfb,
}

@Article{Zhang:2005:WET,
  author          = {Xiangyu Zhang and
                     Rajiv Gupta},
  title           = {Whole execution traces and their applications},
  journal         = j-TACO,
  fjournal        = {ACM Transactions on Architecture and Code Optimization
                     (TACO)},
  volume          = {2},
  number          = {3},
  pages           = {301--334},
  month           = sep,
  year            = {2005},
  ISSN            = {1544-3566 (print), 1544-3973 (electronic)},
  CODEN           = {????},
  journal-URL     = {http://portal.acm.org/browse_dl.cfm?idx=J924},
  bibdate         = {Wed Oct 5 07:42:22 MDT 2005},
  bibsource       = {http://portal.acm.org/;
                     http://www.math.utah.edu/pub/tex/bib/taco.bib},
  acknowledgement = ack-nhfb,
}

@Article{Zhao:2005:IWA,
  author          = {Wankang Zhao and
                     David Whalley and
                     Christopher Healy and
                     Frank Mueller},
  title           = {Improving {WCET} by applying a {WC} code-positioning
                     optimization},
  journal         = j-TACO,
  fjournal        = {ACM Transactions on Architecture and Code Optimization
                     (TACO)},
  volume          = {2},
  number          = {4},
  pages           = {335--365},
  month           = dec,
  year            = {2005},
  ISSN            = {1544-3566 (print), 1544-3973 (electronic)},
  CODEN           = {????},
  journal-URL     = {http://portal.acm.org/browse_dl.cfm?idx=J924},
  keywords        = {WCET (worst case execution time); WC (worst case)},
  bibdate         = {Thu Feb 16 11:03:13 MST 2006},
  bibsource       = {http://portal.acm.org/;
                     http://www.math.utah.edu/pub/tex/bib/taco.bib},
  acknowledgement = ack-nhfb,
}

@Article{Reis:2005:SCF,
  author          = {George A. Reis and
                     Jonathan Chang and
                     Neil Vachharajani and
                     Ram Rangan and
                     David I. August and
                     Shubhendu S. Mukherjee},
  title           = {Software-controlled fault tolerance},
  journal         = j-TACO,
  fjournal        = {ACM Transactions on Architecture and Code Optimization
                     (TACO)},
  volume          = {2},
  number          = {4},
  pages           = {366--396},
  month           = dec,
  year            = {2005},
  ISSN            = {1544-3566 (print), 1544-3973 (electronic)},
  CODEN           = {????},
  journal-URL     = {http://portal.acm.org/browse_dl.cfm?idx=J924},
  bibdate         = {Thu Feb 16 11:03:13 MST 2006},
  bibsource       = {http://portal.acm.org/;
                     http://www.math.utah.edu/pub/tex/bib/taco.bib},
  acknowledgement = ack-nhfb,
}

@Article{Li:2005:PPC,
  author          = {Jian Li and Jos{\'e} F. Mart{\'\i}nez},
  title           = {Power-performance considerations of parallel computing
                     on chip multiprocessors},
  journal         = j-TACO,
  volume          = {2},
  number          = {4},
  pages           = {397--422},
  month           = dec,
  year            = {2005},
  CODEN           = {????},
  ISSN            = {1544-3566 (print), 1544-3973 (electronic)},
  bibdate         = {Thu Feb 16 11:03:13 MST 2006},
  bibsource       = {http://portal.acm.org/;
                     http://www.math.utah.edu/pub/tex/bib/taco.bib},
  acknowledgement = ack-nhfb,
  fjournal        = {ACM Transactions on Architecture and Code Optimization
                     (TACO)},
  journal-URL     = {http://portal.acm.org/browse_dl.cfm?idx=J924},
}

@Article{Sharma:2005:SPE,
  author          = {Saurabh Sharma and Jesse G. Beu and Thomas M. Conte},
  title           = {Spectral prefetcher: {An} effective mechanism for {L2}
                     cache prefetching},
  journal         = j-TACO,
  volume          = {2},
  number          = {4},
  pages           = {423--450},
  month           = dec,
  year            = {2005},
  CODEN           = {????},
  ISSN            = {1544-3566 (print), 1544-3973 (electronic)},
  bibdate         = {Thu Feb 16 11:03:13 MST 2006},
  bibsource       = {http://portal.acm.org/;
                     http://www.math.utah.edu/pub/tex/bib/taco.bib},
  acknowledgement = ack-nhfb,
  fjournal        = {ACM Transactions on Architecture and Code Optimization
                     (TACO)},
  journal-URL     = {http://portal.acm.org/browse_dl.cfm?idx=J924},
}

@Article{Calder:2006:I,
  author          = {Brad Calder and Dean Tullsen},
  title           = {Introduction},
  journal         = j-TACO,
  volume          = {3},
  number          = {1},
  pages           = {1--2},
  month           = mar,
  year            = {2006},
  CODEN           = {????},
  ISSN            = {1544-3566 (print), 1544-3973 (electronic)},
  bibdate         = {Thu May 18 08:38:26 MDT 2006},
  bibsource       = {http://portal.acm.org/;
                     http://www.math.utah.edu/pub/tex/bib/taco.bib},
  acknowledgement = ack-nhfb,
  fjournal        = {ACM Transactions on Architecture and Code Optimization
                     (TACO)},
  journal-URL     = {http://portal.acm.org/browse_dl.cfm?idx=J924},
}

@Article{Tan:2006:BSS,
  author          = {Lin Tan and Brett Brotherton and Timothy Sherwood},
  title           = {Bit-split string-matching engines for intrusion
                     detection and prevention},
  journal         = j-TACO,
  volume          = {3},
  number          = {1},
  pages           = {3--34},
  month           = mar,
  year            = {2006},
  CODEN           = {????},
  ISSN            = {1544-3566 (print), 1544-3973 (electronic)},
  bibdate         = {Thu May 18 08:38:26 MDT 2006},
  bibsource       = {http://portal.acm.org/;
                     http://www.math.utah.edu/pub/tex/bib/taco.bib},
  acknowledgement = ack-nhfb,
  fjournal        = {ACM Transactions on Architecture and Code Optimization
                     (TACO)},
  journal-URL     = {http://portal.acm.org/browse_dl.cfm?idx=J924},
}

@Article{Nagpurkar:2006:ERP,
  author          = {Priya Nagpurkar and Hussam Mousa and Chandra Krintz
                     and Timothy Sherwood},
  title           = {Efficient remote profiling for resource-constrained
                     devices},
  journal         = j-TACO,
  volume          = {3},
  number          = {1},
  pages           = {35--66},
  month           = mar,
  year            = {2006},
  CODEN           = {????},
  ISSN            = {1544-3566 (print), 1544-3973 (electronic)},
  bibdate         = {Thu May 18 08:38:26 MDT 2006},
  bibsource       = {http://portal.acm.org/;
                     http://www.math.utah.edu/pub/tex/bib/taco.bib},
  acknowledgement = ack-nhfb,
  fjournal        = {ACM Transactions on Architecture and Code Optimization
                     (TACO)},
  journal-URL     = {http://portal.acm.org/browse_dl.cfm?idx=J924},
}

@Article{Lin:2006:RCG,
  author          = {Jin Lin and Wei-Chung Hsu and Pen-Chung Yew and Roy
                     Dz-Ching Ju and Tin-Fook Ngai},
  title           = {Recovery code generation for general speculative
                     optimizations},
  journal         = j-TACO,
  volume          = {3},
  number          = {1},
  pages           = {67--89},
  month           = mar,
  year            = {2006},
  CODEN           = {????},
  ISSN            = {1544-3566 (print), 1544-3973 (electronic)},
  bibdate         = {Thu May 18 08:38:26 MDT 2006},
  bibsource       = {http://portal.acm.org/;
                     http://www.math.utah.edu/pub/tex/bib/taco.bib},
  acknowledgement = ack-nhfb,
  fjournal        = {ACM Transactions on Architecture and Code Optimization
                     (TACO)},
  journal-URL     = {http://portal.acm.org/browse_dl.cfm?idx=J924},
}

@Article{Choi:2006:ORR,
  author          = {Yoonseo Choi and Hwansoo Han},
  title           = {Optimal register reassignment for register stack
                     overflow minimization},
  journal         = j-TACO,
  volume          = {3},
  number          = {1},
  pages           = {90--114},
  month           = mar,
  year            = {2006},
  CODEN           = {????},
  ISSN            = {1544-3566 (print), 1544-3973 (electronic)},
  bibdate         = {Thu May 18 08:38:26 MDT 2006},
  bibsource       = {http://portal.acm.org/;
                     http://www.math.utah.edu/pub/tex/bib/taco.bib},
  acknowledgement = ack-nhfb,
  fjournal        = {ACM Transactions on Architecture and Code Optimization
                     (TACO)},
  journal-URL     = {http://portal.acm.org/browse_dl.cfm?idx=J924},
}

@Article{Xue:2006:LOA,
  author          = {Jingling Xue and Qiong Cai},
  title           = {A lifetime optimal algorithm for speculative {PRE}},
  journal         = j-TACO,
  volume          = {3},
  number          = {2},
  pages           = {115--155},
  month           = jun,
  year            = {2006},
  CODEN           = {????},
  ISSN            = {1544-3566 (print), 1544-3973 (electronic)},
  bibdate         = {Fri Jun 9 06:47:22 MDT 2006},
  bibsource       = {http://portal.acm.org/;
                     http://www.math.utah.edu/pub/tex/bib/taco.bib},
  acknowledgement = ack-nhfb,
  fjournal        = {ACM Transactions on Architecture and Code Optimization
                     (TACO)},
  journal-URL     = {http://portal.acm.org/browse_dl.cfm?idx=J924},
}

@Article{Sharkey:2006:IPT,
  author          = {Joseph J. Sharkey and Dmitry V. Ponomarev and Kanad
                     Ghose and Oguz Ergin},
  title           = {Instruction packing: {Toward} fast and
                     energy-efficient instruction scheduling},
  journal         = j-TACO,
  volume          = {3},
  number          = {2},
  pages           = {156--181},
  month           = jun,
  year            = {2006},
  CODEN           = {????},
  ISSN            = {1544-3566 (print), 1544-3973 (electronic)},
  bibdate         = {Fri Jun 9 06:47:22 MDT 2006},
  bibsource       = {http://portal.acm.org/;
                     http://www.math.utah.edu/pub/tex/bib/taco.bib},
  acknowledgement = ack-nhfb,
  fjournal        = {ACM Transactions on Architecture and Code Optimization
                     (TACO)},
  journal-URL     = {http://portal.acm.org/browse_dl.cfm?idx=J924},
}

@Article{Ceze:2006:CUC,
  author          = {Luis Ceze and Karin Strauss and James Tuck and Josep
                     Torrellas and Jose Renau},
  title           = {{CAVA}: {Using} checkpoint-assisted value prediction
                     to hide {L2} misses},
  journal         = j-TACO,
  volume          = {3},
  number          = {2},
  pages           = {182--208},
  month           = jun,
  year            = {2006},
  CODEN           = {????},
  ISSN            = {1544-3566 (print), 1544-3973 (electronic)},
  bibdate         = {Fri Jun 9 06:47:22 MDT 2006},
  bibsource       = {http://portal.acm.org/;
                     http://www.math.utah.edu/pub/tex/bib/taco.bib},
  acknowledgement = ack-nhfb,
  fjournal        = {ACM Transactions on Architecture and Code Optimization
                     (TACO)},
  journal-URL     = {http://portal.acm.org/browse_dl.cfm?idx=J924},
}

@Article{Zhang:2006:EAR,
  author          = {Lixin Zhang and Mike Parker and John Carter},
  title           = {Efficient address remapping in distributed
                     shared-memory systems},
  journal         = j-TACO,
  volume          = {3},
  number          = {2},
  pages           = {209--229},
  month           = jun,
  year            = {2006},
  CODEN           = {????},
  ISSN            = {1544-3566 (print), 1544-3973 (electronic)},
  bibdate         = {Fri Jun 9 06:47:22 MDT 2006},
  bibsource       = {http://portal.acm.org/;
                     http://www.math.utah.edu/pub/tex/bib/taco.bib},
  acknowledgement = ack-nhfb,
  fjournal        = {ACM Transactions on Architecture and Code Optimization
                     (TACO)},
  journal-URL     = {http://portal.acm.org/browse_dl.cfm?idx=J924},
}

@Article{Zhao:2006:ATP,
  author          = {Min Zhao and Bruce R. Childers and Mary Lou Soffa},
  title           = {An approach toward profit-driven optimization},
  journal         = j-TACO,
  volume          = {3},
  number          = {3},
  pages           = {231--262},
  month           = sep,
  year            = {2006},
  CODEN           = {????},
  DOI             = {http://doi.acm.org/10.1145/1162690.1162691},
  ISSN            = {1544-3566 (print), 1544-3973 (electronic)},
  bibdate         = {Sat Sep 23 07:54:36 MDT 2006},
  bibsource       = {http://portal.acm.org/;
                     http://www.math.utah.edu/pub/tex/bib/taco.bib},
  abstract        = {Although optimizations have been applied for a number
                     of years to improve the performance of software,
                     problems with respect to the application of
                     optimizations have not been adequately addressed. For
                     example, in certain circumstances, optimizations may
                     degrade performance. However, there is no efficient way
                     to know when a degradation will occur. In this
                     research, we investigate the profitability of
                     optimizations, which is useful for determining the
                     benefit of applying optimizations. We develop a
                     framework that enables us to predict profitability
                     using analytic models. The profitability of an
                     optimization depends on code context, the particular
                     optimization, and machine resources. Thus, our
                     framework has analytic models for each of these
                     components. As part of the framework, there is also a
                     profitability engine that uses models to predict the
                     profit. In this paper, we target scalar optimizations
                     and, in particular, describe the models for partial
                     redundancy elimination (PRE), loop invariant code
                     motion (LICM), and value numbering (VN). We implemented
                     the framework for predicting the profitability of these
                     optimizations. Based on the predictions, we can
                     selectively apply profitable optimizations. We compared
                     the profit-driven approach with an approach that uses a
                     heuristic in deciding when optimizations should be
                     applied. Our experiments demonstrate that the
                     profitability of scalar optimizations can be accurately
                     predicted by using models. That is, without actually
                     applying a scalar optimization, we can determine if an
                     optimization is beneficial and should be applied.},
  acknowledgement = ack-nhfb,
  fjournal        = {ACM Transactions on Architecture and Code Optimization
                     (TACO)},
  journal-URL     = {http://portal.acm.org/browse_dl.cfm?idx=J924},
}

@Article{Hazelwood:2006:MBC,
  author          = {Kim Hazelwood and Michael D. Smith},
  title           = {Managing bounded code caches in dynamic binary
                     optimization systems},
  journal         = j-TACO,
  volume          = {3},
  number          = {3},
  pages           = {263--294},
  month           = sep,
  year            = {2006},
  CODEN           = {????},
  DOI             = {http://doi.acm.org/10.1145/1162690.1162692},
  ISSN            = {1544-3566 (print), 1544-3973 (electronic)},
  bibdate         = {Sat Sep 23 07:54:36 MDT 2006},
  bibsource       = {http://portal.acm.org/;
                     http://www.math.utah.edu/pub/tex/bib/taco.bib},
  abstract        = {Dynamic binary optimizers store altered copies of
                     original program instructions in software-managed code
                     caches in order to maximize reuse of transformed code.
                     Code caches store code blocks that may vary in size,
                     reference other code blocks, and carry a high
                     replacement overhead. These unique constraints reduce
                     the effectiveness of conventional cache management
                     policies. Our work directly addresses these unique
                     constraints and presents several contributions to the
                     code-cache management problem. First, we show that
                     evicting more than the minimum number of code blocks
                     from the code cache results in less run-time overhead
                     than the existing alternatives. Such granular evictions
                     reduce overall execution time, as the fixed costs of
                     invoking the eviction mechanism are amortized across
                     multiple cache insertions. Second, a study of the ideal
                     lifetimes of dynamically generated code blocks
                     illustrates the benefit of a replacement algorithm
                     based on a generational heuristic. We describe and
                     evaluate a generational approach to code cache
                     management that makes it easy to identify long-lived
                     code blocks and simultaneously avoid any fragmentation
                     because of the eviction of short-lived blocks. Finally,
                     we present results from an implementation of our
                     generational approach in the DynamoRIO framework and
                     illustrate that, as dynamic optimization systems become
                     more prevalent, effective code cache-management
                     policies will be essential for reliable, scalable
                     performance of modern applications.},
  acknowledgement = ack-nhfb,
  fjournal        = {ACM Transactions on Architecture and Code Optimization
                     (TACO)},
  journal-URL     = {http://portal.acm.org/browse_dl.cfm?idx=J924},
}

@Article{Rochecouste:2006:CCE,
  author          = {Olivier Rochecouste and Gilles Pokam and Andr{\'e}
                     Seznec},
  title           = {A case for a complexity-effective, width-partitioned
                     microarchitecture},
  journal         = j-TACO,
  volume          = {3},
  number          = {3},
  pages           = {295--326},
  month           = sep,
  year            = {2006},
  CODEN           = {????},
  DOI             = {http://doi.acm.org/10.1145/1162690.1162693},
  ISSN            = {1544-3566 (print), 1544-3973 (electronic)},
  bibdate         = {Sat Sep 23 07:54:36 MDT 2006},
  bibsource       = {http://portal.acm.org/;
                     http://www.math.utah.edu/pub/tex/bib/taco.bib},
  abstract        = {The analysis of program executions reveals that most
                     integer and multimedia applications make heavy use of
                     narrow-width operations, i.e., instructions exclusively
                     using narrow-width operands and producing a
                     narrow-width result. Moreover, this usage is relatively
                     well distributed over the application. We observed this
                     program property on the MediaBench and SPEC2000
                     benchmarks with about 40\% of the instructions being
                     narrow-width operations. Current superscalar processors
                     use 64-bit datapaths to execute all the instructions of
                     the applications. In this paper, we suggest the use of
                     a width-partitioned microarchitecture (WPM) to master
                     the hardware complexity of a superscalar processor. For
                     a four-way issue machine, we split the processor in two
                     two-way clusters: the main cluster executing 64-bit
                     operations, load/store, and complex operations and a
                     narrow cluster executing the 16-bit operations. We
                     resort to partitioning to decouple the treatment of the
                     narrow-width operations from that of the other program
                     instructions. This provides the benefit of greatly
                     simplifying the design of the critical processor
                     components in each cluster (e.g., the register file and
                     the bypass network). The dynamic interleaving of the
                     two instruction types allows maintaining the workload
                     balanced among clusters. WPM also helps to reduce the
                     complexity of the interconnection fabric and of the
                     issue logic. In fact, since the 16-bit cluster can only
                     communicate narrow-width data, the datapath-width of
                     the interconnect fabric can be significantly reduced,
                     yielding a corresponding saving of the interconnect
                     power and area. We explore different possible
                     configurations of WPM, discussing the various
                     implementation tradeoffs. We also examine a speculative
                     steering heuristic to distribute the narrow-width
                     operations among clusters. A detailed analysis of the
                     complexity factors shows using WPM instead of a
                     classical 64-bit two-cluster microarchitecture can save
                     power and silicon area with a minimal impact on the
                     overall performance.},
  acknowledgement = ack-nhfb,
  fjournal        = {ACM Transactions on Architecture and Code Optimization
                     (TACO)},
  journal-URL     = {http://portal.acm.org/browse_dl.cfm?idx=J924},
}

@Article{Zmily:2006:BAI,
  author          = {Ahmad Zmily and Christos Kozyrakis},
  title           = {Block-aware instruction set architecture},
  journal         = j-TACO,
  volume          = {3},
  number          = {3},
  pages           = {327--357},
  month           = sep,
  year            = {2006},
  CODEN           = {????},
  DOI             = {http://doi.acm.org/10.1145/1162690.1162694},
  ISSN            = {1544-3566 (print), 1544-3973 (electronic)},
  bibdate         = {Sat Sep 23 07:54:36 MDT 2006},
  bibsource       = {http://portal.acm.org/;
                     http://www.math.utah.edu/pub/tex/bib/taco.bib},
  abstract        = {Instruction delivery is a critical component for
                     wide-issue, high-frequency processors since its
                     bandwidth and accuracy place an upper limit on
                     performance. The processor front-end accuracy and
                     bandwidth are limited by instruction-cache misses,
                     multicycle instruction-cache accesses, and target or
                     direction mispredictions for control-flow operations.
                     This paper presents a block-aware instruction set
                     (BLISS) that allows software to assist with front-end
                     challenges. BLISS defines basic block descriptors that
                     are stored separately from the actual instructions in a
                     program. We show that BLISS allows for a decoupled
                     front-end that tolerates instruction-cache latency,
                     facilitates instruction prefetching, and leads to
                     higher prediction accuracy.},
  acknowledgement = ack-nhfb,
  fjournal        = {ACM Transactions on Architecture and Code Optimization
                     (TACO)},
  journal-URL     = {http://portal.acm.org/browse_dl.cfm?idx=J924},
}

@Article{Crandall:2006:MAS,
  author          = {Jedidiah R. Crandall and S. Felix Wu and Frederic T.
                     Chong},
  title           = {{Minos}: {Architectural} support for protecting
                     control data},
  journal         = j-TACO,
  volume          = {3},
  number          = {4},
  pages           = {359--389},
  month           = dec,
  year            = {2006},
  CODEN           = {????},
  ISSN            = {1544-3566 (print), 1544-3973 (electronic)},
  bibdate         = {Sat Apr 14 10:44:57 MDT 2007},
  bibsource       = {http://portal.acm.org/;
                     http://www.math.utah.edu/pub/tex/bib/taco.bib},
  acknowledgement = ack-nhfb,
  fjournal        = {ACM Transactions on Architecture and Code Optimization
                     (TACO)},
  journal-URL     = {http://portal.acm.org/browse_dl.cfm?idx=J924},
}

@Article{Marathe:2006:ACC,
  author          = {Jaydeep Marathe and Frank Mueller and Bronis R. de
                     Supinski},
  title           = {Analysis of cache-coherence bottlenecks with hybrid
                     hardware\slash software techniques},
  journal         = j-TACO,
  volume          = {3},
  number          = {4},
  pages           = {390--423},
  month           = dec,
  year            = {2006},
  CODEN           = {????},
  ISSN            = {1544-3566 (print), 1544-3973 (electronic)},
  bibdate         = {Sat Apr 14 10:44:57 MDT 2007},
  bibsource       = {http://portal.acm.org/;
                     http://www.math.utah.edu/pub/tex/bib/taco.bib},
  acknowledgement = ack-nhfb,
  fjournal        = {ACM Transactions on Architecture and Code Optimization
                     (TACO)},
  journal-URL     = {http://portal.acm.org/browse_dl.cfm?idx=J924},
}

@Article{Ganusov:2006:FEP,
  author =       "Ilya Ganusov and Martin Burtscher",
  title =        "Future execution: {A} prefetching mechanism that uses
                 multiple cores to speed up single threads",
  journal =      j-TACO,
  volume =       "3",
  number =       "4",
  pages =        "424--449",
  month =        dec,
  year =         "2006",
  CODEN =        "????",
  ISSN =         "1544-3566 (print), 1544-3973 (electronic)",
  bibdate =      "Sat Apr 14 10:44:57 MDT 2007",
  bibsource =    "http://portal.acm.org/;
                 http://www.math.utah.edu/pub/tex/bib/taco.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM Transactions on Architecture and Code Optimization
                 (TACO)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J924",
}

@Article{Co:2006:ETC,
  author          = {Michele Co and Dee A. B. Weikle and Kevin Skadron},
  title           = {Evaluating trace cache energy efficiency},
  journal         = j-TACO,
  volume          = {3},
  number          = {4},
  pages           = {450--476},
  month           = dec,
  year            = {2006},
  CODEN           = {????},
  ISSN            = {1544-3566 (print), 1544-3973 (electronic)},
  bibdate         = {Sat Apr 14 10:44:57 MDT 2007},
  bibsource       = {http://portal.acm.org/;
                     http://www.math.utah.edu/pub/tex/bib/taco.bib},
  acknowledgement = ack-nhfb,
  fjournal        = {ACM Transactions on Architecture and Code Optimization
                     (TACO)},
  journal-URL     = {http://portal.acm.org/browse_dl.cfm?idx=J924},
}

@Article{Hu:2006:EMM,
  author          = {Shiwen Hu and Madhavi Valluri and Lizy Kurian John},
  title           = {Effective management of multiple configurable units
                     using dynamic optimization},
  journal         = j-TACO,
  volume          = {3},
  number          = {4},
  pages           = {477--501},
  month           = dec,
  year            = {2006},
  CODEN           = {????},
  ISSN            = {1544-3566 (print), 1544-3973 (electronic)},
  bibdate         = {Sat Apr 14 10:44:57 MDT 2007},
  bibsource       = {http://portal.acm.org/;
                     http://www.math.utah.edu/pub/tex/bib/taco.bib},
  acknowledgement = ack-nhfb,
  fjournal        = {ACM Transactions on Architecture and Code Optimization
                     (TACO)},
  journal-URL     = {http://portal.acm.org/browse_dl.cfm?idx=J924},
}

@Article{Bentley:2006:IAB,
  author          = {Chris Bentley and Scott A. Watterson and David K.
                     Lowenthal and Barry Rountree},
  title           = {Implicit array bounds checking on 64-bit
                     architectures},
  journal         = j-TACO,
  volume          = {3},
  number          = {4},
  pages           = {502--527},
  month           = dec,
  year            = {2006},
  CODEN           = {????},
  DOI             = {http://doi.acm.org/10.1145/1187976.1187982},
  ISSN            = {1544-3566 (print), 1544-3973 (electronic)},
  bibdate         = {Sat Apr 14 10:44:57 MDT 2007},
  bibsource       = {http://portal.acm.org/;
                     http://www.math.utah.edu/pub/tex/bib/taco.bib},
  abstract        = {Several programming languages guarantee that array
                     subscripts are checked to ensure they are within the
                     bounds of the array. While this guarantee improves the
                     correctness and security of array-based code, it adds
                     overhead to array references. This has been an obstacle
                     to using higher-level languages, such as Java, for
                     high-performance parallel computing, where the language
                     specification requires that all array accesses must be
                     checked to ensure they are within bounds. This is
                     because, in practice, array-bounds checking in
                     scientific applications may increase execution time by
                     more than a factor of 2. Previous research has explored
                     optimizations to statically eliminate bounds checks,
                     but the dynamic nature of many scientific codes makes
                     this difficult or impossible. Our approach is, instead,
                     to create a compiler and operating system
                     infrastructure that does not generate explicit bounds
                     checks. It instead places arrays inside of Index
                     Confinement Regions (ICRs), which are large, isolated,
                     mostly unmapped virtual memory regions. Any array
                     reference outside of its bounds will cause a protection
                     violation; this provides implicit bounds checking. Our
                     results show that when applying this infrastructure to
                     high-performance computing programs written in Java,
                     the overhead of bounds checking relative to a program
                     with no bounds checks is reduced from an average of
                     63\% to an average of 9\%.},
  acknowledgement = ack-nhfb,
  fjournal        = {ACM Transactions on Architecture and Code Optimization
                     (TACO)},
  journal-URL     = {http://portal.acm.org/browse_dl.cfm?idx=J924},
}

@Article{Calder:2007:I,
  author =       "Brad Calder and Dean Tullsen",
  title =        "Introduction",
  journal =      j-TACO,
  volume =       "4",
  number =       "1",
  pages =        "1:1--1:??",
  month =        mar,
  year =         "2007",
  CODEN =        "????",
  ISSN =         "1544-3566 (print), 1544-3973 (electronic)",
  bibdate =      "Sat Apr 14 10:44:57 MDT 2007",
  bibsource =    "http://portal.acm.org/;
                 http://www.math.utah.edu/pub/tex/bib/taco.bib",
  acknowledgement = ack-nhfb,
  articleno =    "1",
  fjournal =     "ACM Transactions on Architecture and Code Optimization
                 (TACO)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J924",
}

@Article{Constantinides:2007:ARC,
  author =       "Kypros Constantinides and Stephen Plaza and Jason
                 Blome and Valeria Bertacco and Scott Mahlke and Todd
                 Austin and Bin Zhang and Michael Orshansky",
  title =        "Architecting a reliable {CMP} switch architecture",
  journal =      j-TACO,
  volume =       "4",
  number =       "1",
  pages =        "2:1--2:??",
  month =        mar,
  year =         "2007",
  CODEN =        "????",
  ISSN =         "1544-3566 (print), 1544-3973 (electronic)",
  bibdate =      "Sat Apr 14 10:44:57 MDT 2007",
  bibsource =    "http://portal.acm.org/;
                 http://www.math.utah.edu/pub/tex/bib/taco.bib",
  acknowledgement = ack-nhfb,
  articleno =    "2",
  fjournal =     "ACM Transactions on Architecture and Code Optimization
                 (TACO)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J924",
}

@Article{Hwang:2007:SSA,
  author =       "Yuan-Shin Hwang and Jia-Jhe Li",
  title =        "Snug set-associative caches: {Reducing} leakage power
                 of instruction and data caches with no performance
                 penalties",
  journal =      j-TACO,
  volume =       "4",
  number =       "1",
  pages =        "6:1--6:??",
  month =        mar,
  year =         "2007",
  CODEN =        "????",
  ISSN =         "1544-3566 (print), 1544-3973 (electronic)",
  bibdate =      "Sat Apr 14 10:44:57 MDT 2007",
  bibsource =    "http://portal.acm.org/;
                 http://www.math.utah.edu/pub/tex/bib/taco.bib",
  acknowledgement = ack-nhfb,
  articleno =    "6",
  fjournal =     "ACM Transactions on Architecture and Code Optimization
                 (TACO)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J924",
}

@Article{Luo:2007:CNP,
  author =       "Yan Luo and Jia Yu and Jun Yang and Laxmi N. Bhuyan",
  title =        "Conserving network processor power consumption by
                 exploiting traffic variability",
  journal =      j-TACO,
  volume =       "4",
  number =       "1",
  pages =        "4:1--4:??",
  month =        mar,
  year =         "2007",
  CODEN =        "????",
  ISSN =         "1544-3566 (print), 1544-3973 (electronic)",
  bibdate =      "Sat Apr 14 10:44:57 MDT 2007",
  bibsource =    "http://portal.acm.org/;
                 http://www.math.utah.edu/pub/tex/bib/taco.bib",
  acknowledgement = ack-nhfb,
  articleno =    "4",
  fjournal =     "ACM Transactions on Architecture and Code Optimization
                 (TACO)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J924",
}

@Article{Rong:2007:SDS,
  author =       "Hongbo Rong and Zhizhong Tang and R. Govindarajan and
                 Alban Douillet and Guang R. Gao",
  title =        "Single-dimension software pipelining for
                 multidimensional loops",
  journal =      j-TACO,
  volume =       "4",
  number =       "1",
  pages =        "7:1--7:??",
  month =        mar,
  year =         "2007",
  CODEN =        "????",
  ISSN =         "1544-3566 (print), 1544-3973 (electronic)",
  bibdate =      "Sat Apr 14 10:44:57 MDT 2007",
  bibsource =    "http://portal.acm.org/;
                 http://www.math.utah.edu/pub/tex/bib/taco.bib",
  acknowledgement = ack-nhfb,
  articleno =    "7",
  fjournal =     "ACM Transactions on Architecture and Code Optimization
                 (TACO)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J924",
}

@Article{Sasanka:2007:AES,
  author =       "Ruchira Sasanka and Man-Lap Li and Sarita V. Adve and
                 Yen-Kuang Chen and Eric Debes",
  title =        "{ALP}: {Efficient} support for all levels of
                 parallelism for complex media applications",
  journal =      j-TACO,
  volume =       "4",
  number =       "1",
  pages =        "3:1--3:??",
  month =        mar,
  year =         "2007",
  CODEN =        "????",
  ISSN =         "1544-3566 (print), 1544-3973 (electronic)",
  bibdate =      "Sat Apr 14 10:44:57 MDT 2007",
  bibsource =    "http://portal.acm.org/;
                 http://www.math.utah.edu/pub/tex/bib/taco.bib",
  acknowledgement = ack-nhfb,
  articleno =    "3",
  fjournal =     "ACM Transactions on Architecture and Code Optimization
                 (TACO)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J924",
}

@Article{Soteriou:2007:SDP,
  author =       "Vassos Soteriou and Noel Eisley and Li-Shiuan Peh",
  title =        "Software-directed power-aware interconnection
                 networks",
  journal =      j-TACO,
  volume =       "4",
  number =       "1",
  pages =        "5:1--5:??",
  month =        mar,
  year =         "2007",
  CODEN =        "????",
  ISSN =         "1544-3566 (print), 1544-3973 (electronic)",
  bibdate =      "Sat Apr 14 10:44:57 MDT 2007",
  bibsource =    "http://portal.acm.org/;
                 http://www.math.utah.edu/pub/tex/bib/taco.bib",
  acknowledgement = ack-nhfb,
  articleno =    "5",
  fjournal =     "ACM Transactions on Architecture and Code Optimization
                 (TACO)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J924",
}

@Article{Bower:2007:ODH,
  author =       "Fred A. Bower and Daniel J. Sorin and Sule Ozev",
  title =        "Online diagnosis of hard faults in microprocessors",
  journal =      j-TACO,
  volume =       "4",
  number =       "2",
  pages =        "8:1--8:??",
  month =        jun,
  year =         "2007",
  CODEN =        "????",
  DOI =          "10.1145/1250727.1250728",
  ISSN =         "1544-3566 (print), 1544-3973 (electronic)",
  bibdate =      "Mon Jun 16 11:40:54 MDT 2008",
  bibsource =    "http://portal.acm.org/;
                 http://www.math.utah.edu/pub/tex/bib/taco.bib",
  abstract =     "We develop a microprocessor design that tolerates hard
                 faults, including fabrication defects and in-field
                 faults, by leveraging existing microprocessor
                 redundancy. To do this, we must: detect and correct
                 errors, diagnose hard faults at the field
                 deconfigurable unit (FDU) granularity, and deconfigure
                 FDUs with hard faults. In our reliable microprocessor
                 design, we use DIVA dynamic verification to detect and
                 correct errors. Our new scheme for diagnosing hard
                 faults tracks instructions' core structure occupancy
                 from decode until commit. If a DIVA checker detects an
                 error in an instruction, it increments a small
                 saturating error counter for every FDU used by that
                 instruction, including that DIVA checker. A hard fault
                 in an FDU quickly leads to an above-threshold error
                 counter for that FDU and thus diagnoses the fault. For
                 deconfiguration, we use previously developed schemes
                 for functional units and buffers and present a scheme
                 for deconfiguring DIVA checkers. Experimental results
                 show that our reliable microprocessor quickly and
                 accurately diagnoses each hard fault that is injected
                 and continues to function, albeit with somewhat
                 degraded performance.",
  acknowledgement = ack-nhfb,
  articleno =    "8",
  fjournal =     "ACM Transactions on Architecture and Code Optimization
                 (TACO)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J924",
  keywords =     "fine-grained diagnosis; hard fault tolerance;
                 processor microarchitecture",
}

@Article{Michaud:2007:STM,
  author =       "Pierre Michaud and Andr{\'e} Seznec and Damien Fetis
                 and Yiannakis Sazeides and Theofanis Constantinou",
  title =        "A study of thread migration in temperature-constrained
                 multicores",
  journal =      j-TACO,
  volume =       "4",
  number =       "2",
  pages =        "9:1--9:??",
  month =        jun,
  year =         "2007",
  CODEN =        "????",
  DOI =          "10.1145/1250727.1250729",
  ISSN =         "1544-3566 (print), 1544-3973 (electronic)",
  bibdate =      "Mon Jun 16 11:40:54 MDT 2008",
  bibsource =    "http://portal.acm.org/;
                 http://www.math.utah.edu/pub/tex/bib/taco.bib",
  abstract =     "Temperature has become an important constraint in
                 high-performance processors, especially multicores.
                 Thread migration will be essential to exploit the full
                 potential of future thermally constrained multicores.
                 We propose and study a thread migration method that
                 maximizes performance under a temperature constraint,
                 while minimizing the number of migrations and ensuring
                 fairness between threads. We show that thread migration
                 brings important performance gains and that it is most
                 effective during the first tens of seconds following a
                 decrease of the number of running threads.",
  acknowledgement = ack-nhfb,
  articleno =    "9",
  fjournal =     "ACM Transactions on Architecture and Code Optimization
                 (TACO)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J924",
  keywords =     "multicore processor; power density; temperature;
                 thermal management; thread migration",
}

@Article{Chen:2007:CRL,
  author =       "Yu Chen and Fuxin Zhang",
  title =        "Code reordering on limited branch offset",
  journal =      j-TACO,
  volume =       "4",
  number =       "2",
  pages =        "10:1--10:??",
  month =        jun,
  year =         "2007",
  CODEN =        "????",
  DOI =          "10.1145/1250727.1250730",
  ISSN =         "1544-3566 (print), 1544-3973 (electronic)",
  bibdate =      "Mon Jun 16 11:40:54 MDT 2008",
  bibsource =    "http://portal.acm.org/;
                 http://www.math.utah.edu/pub/tex/bib/taco.bib",
  abstract =     "Since the 1980's code reordering has gained popularity
                 as an important way to improve the spatial locality of
                 programs. While the effect of the processor's
                 microarchitecture and memory hierarchy on this
                 optimization technique has been investigated, little
                 research has focused on the impact of the instruction
                 set. In this paper, we analyze the effect of limited
                 branch offset of the MIPS-like instruction set [Hwu et
                 al. 2004, 2005] on code reordering, explore two simple
                 methods to handle the exceeded branches, and propose
                 the bidirectional code layout (BCL) algorithm to reduce
                 the number of branches exceeding the offset limit. The
                 BCL algorithm sorts the chains according to the
                 position of related chains, avoids cache conflict
                 misses deliberately and lays out the code
                 bidirectionally. It strikes a balance among the
                 distance of related blocks, the instruction cache miss
                 rate, the memory size required, and the control flow
                 transfer. Experimental results show that BCL can
                 effectively reduce exceeded branches by 50.1\%, on
                 average, with up to 100\% for some programs. Except for
                 some programs with little spatial locality, the BCL
                 algorithm can achieve the performance, as the case with
                 no branch offset limitation.",
  acknowledgement = ack-nhfb,
  articleno =    "10",
  fjournal =     "ACM Transactions on Architecture and Code Optimization
                 (TACO)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J924",
  keywords =     "code reordering; Godson Processor; link-time
                 optimization",
}

@Article{Terechko:2007:ICC,
  author =       "A. S. Terechko and H. Corporaal",
  title =        "Inter-cluster communication in {VLIW} architectures",
  journal =      j-TACO,
  volume =       "4",
  number =       "2",
  pages =        "11:1--11:??",
  month =        jun,
  year =         "2007",
  CODEN =        "????",
  DOI =          "10.1145/1250727.1250731",
  ISSN =         "1544-3566 (print), 1544-3973 (electronic)",
  bibdate =      "Mon Jun 16 11:40:54 MDT 2008",
  bibsource =    "http://portal.acm.org/;
                 http://www.math.utah.edu/pub/tex/bib/taco.bib",
  abstract =     "The traditional VLIW (very long instruction word)
                 architecture with a single register file does not scale
                 up well to address growing performance demands on
                 embedded media processors. However, splitting a VLIW
                 processor in smaller clusters, which are comprised of
                 function units fully connected to local register files,
                 can significantly improve VLSI implementation
                 characteristics of the processor, such as speed, energy
                 consumption, and area. In our paper we reveal that
                 achieving the best characteristics of a clustered VLIW
                 requires a thorough selection of an Inter-cluster
                 Communication (ICC) model, which is the way clustering
                 is exposed in the Instruction Set Architecture. For our
                 study we, first, define a taxonomy of ICC models
                 including copy operations, dedicated issue slots,
                 extended operands, extended results, and multicast.
                 Evaluation of the execution time of the models requires
                 both the dynamic cycle count and clock period. We
                 developed an advanced instruction scheduler for all the
                 five ICC models in order to quantify the dynamic cycle
                 counts of our multimedia C benchmarks. To assess the
                 clock period of the ICC models we designed and laid out
                 VLIW datapaths using the RTL hardware descriptions
                 derived from a deeply pipelined commercial TriMedia
                 processor. In contrast to prior art, our research shows
                 that fully distributed register file architectures
                 (with eight clusters in our study) often underperform
                 compared to moderately clustered machines with two or
                 four clusters because of explosion of the cycle count
                 overhead in the former. Among the evaluated ICC models,
                 performance of the copy operation model, popular both
                 in academia and industry, is severely limited by the
                 copy operations hampering scheduling of regular
                 operations in high ILP (instruction-level parallelism)
                 code. The dedicated issue slots model combats this
                 limitation by dedicating extra VLIW issue slots purely
                 for ICC, reaching the highest 1.74 execution time
                 speedup relative to the unicluster. Furthermore, our
                 VLSI experiments show that the lowest area and energy
                 consumption of 42 and 57\% relative to the unicluster,
                 respectively, are achieved by the extended operands
                 model, which, nevertheless, provides higher performance
                 than the copy operation model.",
  acknowledgement = ack-nhfb,
  articleno =    "11",
  fjournal =     "ACM Transactions on Architecture and Code Optimization
                 (TACO)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J924",
  keywords =     "clock frequency; cluster assignment; instruction-level
                 parallelism; instruction scheduler; intercluster
                 communication; optimizing compiler; pipelining;
                 register allocation; VLIW",
}

@Article{Dou:2007:CCM,
  author =       "Jialin Dou and Marcelo Cintra",
  title =        "A compiler cost model for speculative
                 parallelization",
  journal =      j-TACO,
  volume =       "4",
  number =       "2",
  pages =        "12:1--12:??",
  month =        jun,
  year =         "2007",
  CODEN =        "????",
  DOI =          "10.1145/1250727.1250732",
  ISSN =         "1544-3566 (print), 1544-3973 (electronic)",
  bibdate =      "Mon Jun 16 11:40:54 MDT 2008",
  bibsource =    "http://portal.acm.org/;
                 http://www.math.utah.edu/pub/tex/bib/taco.bib",
  abstract =     "Speculative parallelization is a technique that allows
                 code sections that cannot be fully analyzed by the
                 compiler to be aggressively executed in parallel.
                 However, while speculative parallelization can
                 potentially deliver significant speedups, several
                 overheads associated with this technique can limit
                 these speedups in practice. This paper proposes a novel
                 compiler static cost model of speculative multithreaded
                 execution that can be used to predict the resulting
                 performance. This model attempts to predict the
                 expected speedups, or slowdowns, of the candidate
                 speculative sections based on the estimation of the
                 combined runtime effects of various overheads, and
                 taking into account the scheduling restrictions of most
                 speculative execution environments. The model is based
                 on estimating the likely execution duration of threads
                 and considers all the possible permutations of these
                 threads. This model also produces a quantitative
                 estimate of the speedup, which is different from prior
                 heuristics that only qualitatively estimate the
                 benefits of speculative multithreaded execution. In
                 previous work, a limited version of the framework was
                 evaluated on a number of loops from a collection of
                 SPEC benchmarks that suffer mainly from load imbalance
                 and thread dispatch and commit overheads. In this work,
                 an extended framework is also evaluated on loops that
                 may suffer from data-dependence violations.
                 Experimental results show that prediction accuracy is
                 lower when loops with violations are included.
                 Nevertheless, accuracy is still very high for a static
                 model: the framework can identify, on average, 45\% of
                 the loops that cause slowdowns and, on average, 96\% of
                 the loops that lead to speedups; it predicts the
                 speedups or slowdowns with an error of less than 20\%
                 for an average of 28\% of the loops across the
                 benchmarks and with an error of less than 50\% for an
                 average of 80\% of the loops. Overall, the framework
                 often outperforms, by as much as 25\%, a naive approach
                 that attempts to speculatively parallelize all the
                 loops considered, and is able to curb the large
                 slowdowns caused in many cases by this naive
                 approach.",
  acknowledgement = ack-nhfb,
  articleno =    "12",
  fjournal =     "ACM Transactions on Architecture and Code Optimization
                 (TACO)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J924",
  keywords =     "speculative multithreading; speculative
                 parallelization; thread-level speculation",
}

@Article{Amme:2007:SBM,
  author =       "Wolfram Amme and Jeffery von Ronne and Michael
                 Franz",
  title =        "{SSA}-based mobile code: {Implementation} and
                 empirical evaluation",
  journal =      j-TACO,
  volume =       "4",
  number =       "2",
  pages =        "13:1--13:??",
  month =        jun,
  year =         "2007",
  CODEN =        "????",
  DOI =          "10.1145/1250727.1250733",
  ISSN =         "1544-3566 (print), 1544-3973 (electronic)",
  bibdate =      "Mon Jun 16 11:40:54 MDT 2008",
  bibsource =    "http://portal.acm.org/;
                 http://www.math.utah.edu/pub/tex/bib/taco.bib",
  abstract =     "Although one might expect transportation formats based
                 on static single-assignment form (SSA) to yield faster
                 just-in-time compilation times than those based on
                 stack-based virtual machines, this claim has not
                 previously been validated, in practice. We attempt to
                 quantify the effect of using an SSA-based mobile code
                 representation by integrating support for a verifiable
                 SSA-based IR into Jikes RVM. Performance results,
                 measured with various optimizations and on both the
                 IA32 and PowerPC, show improvements in both compilation
                 time and code quality.",
  acknowledgement = ack-nhfb,
  articleno =    "13",
  fjournal =     "ACM Transactions on Architecture and Code Optimization
                 (TACO)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J924",
  keywords =     "SafeTSA; static single-assignment form; virtual
                 machines",
}

@Article{Li:2007:CCE,
  author =       "Xiaodong Li and Ritu Gupta and Sarita V. Adve and
                 Yuanyuan Zhou",
  title =        "Cross-component energy management: {Joint} adaptation
                 of processor and memory",
  journal =      j-TACO,
  volume =       "4",
  number =       "3",
  pages =        "14:1--14:??",
  month =        sep,
  year =         "2007",
  CODEN =        "????",
  DOI =          "10.1145/1275937.1275938",
  ISSN =         "1544-3566 (print), 1544-3973 (electronic)",
  bibdate =      "Mon Jun 16 11:41:20 MDT 2008",
  bibsource =    "http://portal.acm.org/;
                 http://www.math.utah.edu/pub/tex/bib/taco.bib",
  abstract =     "Researchers have proposed the use of adaptation to
                 reduce the energy consumption of different hardware
                 components, such as the processor, memory, disk, and
                 display for general-purpose applications. Previous
                 algorithms to control these adaptations, however, have
                 focused on a single component. This work takes the
                 first step toward developing algorithms that can
                 jointly control adaptations in multiple interacting
                 components for general-purpose applications, with the
                 goal of minimizing the total energy consumed within a
                 specified performance loss. Specifically, we develop a
                 joint-adaptation algorithm for processor and memory
                 adaptations. We identify two properties that enable
                 per-component algorithms to be easily used in a
                 cross-component context---the algorithms' performance
                 impact must be guaranteed and composable. We then
                 modify a current processor and a memory algorithm to
                 obey these properties. This allows the cross-component
                 problem to be reduced to determine an appropriate
                 (energy-optimal) allocation of the target performance
                 loss (slack) between the two components. We develop
                 such an optimal slack allocation algorithm that
                 exploits the above properties. The result is an
                 efficient cross-component adaptation framework that
                 minimizes the total energy of the processor and memory
                 without exceeding the target performance loss, while
                 substantially leveraging current per-component
                 algorithms. Our experiments show that joint processor
                 and memory adaptation provides significantly more
                 energy savings than adapting either component alone;
                 intelligent slack distribution is specifically
                 effective for highly compute- or memory-intensive
                 applications; and the performance slowdown never
                 exceeds the specification.",
  acknowledgement = ack-nhfb,
  articleno =    "14",
  fjournal =     "ACM Transactions on Architecture and Code Optimization
                 (TACO)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J924",
  keywords =     "adaptive systems; control algorithms; energy
                 management; low-power design; memory; performance
                 guarantee; processor",
}

@Article{Gabor:2007:FES,
  author =       "Ron Gabor and Shlomo Weiss and Avi Mendelson",
  title =        "Fairness enforcement in switch on event
                 multithreading",
  journal =      j-TACO,
  volume =       "4",
  number =       "3",
  pages =        "15:1--15:??",
  month =        sep,
  year =         "2007",
  CODEN =        "????",
  DOI =          "10.1145/1275937.1275939",
  ISSN =         "1544-3566 (print), 1544-3973 (electronic)",
  bibdate =      "Mon Jun 16 11:41:20 MDT 2008",
  bibsource =    "http://portal.acm.org/;
                 http://www.math.utah.edu/pub/tex/bib/taco.bib",
  abstract =     "The need to reduce power and complexity will increase
                 the interest in Switch On Event multithreading
                 (coarse-grained multithreading). Switch On Event
                 multithreading is a low-power and low-complexity
                 mechanism to improve processor throughput by switching
                 threads on execution stalls. Fairness may, however,
                 become a problem in a multithreaded processor. Unless
                 fairness is properly handled, some threads may starve
                 while others consume all of the processor cycles.
                 Heuristics that were devised in order to improve
                 fairness in simultaneous multithreading are not
                 applicable to Switch On Event multithreading. This
                 paper defines the fairness metric using the ratio of
                 the individual threads' speedups and shows how it can
                 be enforced in Switch On Event multithreading. Fairness
                 is controlled by forcing additional thread switch
                 points. These switch points are determined dynamically
                 by runtime estimation of the single threaded
                 performance of each of the individual threads. We
                 analyze the impact of the fairness enforcement
                 mechanism on aggregate IPC and weighted speedup. We
                 present simulation results of the performance of Switch
                 On Event multithreading. Switch On Event multithreading
                 achieves an average aggregate IPC increase of 26\% over
                 single thread and 12\% weighted speedup when no
                 fairness is enforced. In this case, a sixth of our runs
                 resulted in poor fairness in which one thread ran
                 extremely slowly (10 to 100 times slower than its
                 single-thread performance), while the other thread's
                 performance was hardly affected. By using the proposed
                 mechanism, we can guarantee fairness at different
                 levels of strictness and, in most cases, even improve
                 the weighted speedup.",
  acknowledgement = ack-nhfb,
  articleno =    "15",
  fjournal =     "ACM Transactions on Architecture and Code Optimization
                 (TACO)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J924",
  keywords =     "coarse-grained multithreading; fairness;
                 multithreading; performance; SOE; Switch on Event
                 multithreading; throughput; weighted speedup",
}

@Article{Andrade:2007:PAA,
  author =       "Diego Andrade and Basilio B. Fraguela and Ram{\'o}n
                 Doallo",
  title =        "Precise automatable analytical modeling of the cache
                 behavior of codes with indirections",
  journal =      j-TACO,
  volume =       "4",
  number =       "3",
  pages =        "16:1--16:??",
  month =        sep,
  year =         "2007",
  CODEN =        "????",
  DOI =          "10.1145/1275937.1275940",
  ISSN =         "1544-3566 (print), 1544-3973 (electronic)",
  bibdate =      "Mon Jun 16 11:41:20 MDT 2008",
  bibsource =    "http://portal.acm.org/;
                 http://www.math.utah.edu/pub/tex/bib/taco.bib",
  abstract =     "The performance of memory hierarchies, in which caches
                 play an essential role, is critical in nowadays
                 general-purpose and embedded computing systems because
                 of the growing memory bottleneck problem.
                 Unfortunately, cache behavior is very unstable and
                 difficult to predict. This is particularly true in the
                 presence of irregular access patterns, which exhibit
                 little locality. Such patterns are very common, for
                 example, in applications in which pointers or
                 compressed sparse matrices give place to indirections.
                 Nevertheless, cache behavior in the presence of
                 irregular access patterns has not been widely studied.
                 In this paper we present an extension of a systematic
                 analytical modeling technique based on PMEs
                 (probabilistic miss equations), previously developed by
                 the authors, that allows the automated analysis of the
                 cache behavior for codes with irregular access patterns
                 resulting from indirections. The model generates very
                 accurate predictions despite the irregularities and has
                 very low computing requirements, being the first model
                 that gathers these desirable characteristics that can
                 automatically analyze this kind of codes. These
                 properties enable this model to help drive compiler
                 optimizations, as we show with an example.",
  acknowledgement = ack-nhfb,
  articleno =    "16",
  fjournal =     "ACM Transactions on Architecture and Code Optimization
                 (TACO)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J924",
  keywords =     "analytical modeling; irregular access patterns; memory
                 hierarchy; performance prediction",
}

@Article{Venstermans:2007:JOH,
  author =       "Kris Venstermans and Lieven Eeckhout and Koen {De
                 Bosschere}",
  title =        "{Java} object header elimination for reduced memory
                 consumption in 64-bit virtual machines",
  journal =      j-TACO,
  volume =       "4",
  number =       "3",
  pages =        "17:1--17:??",
  month =        sep,
  year =         "2007",
  CODEN =        "????",
  DOI =          "10.1145/1275937.1275941",
  ISSN =         "1544-3566 (print), 1544-3973 (electronic)",
  bibdate =      "Mon Jun 16 11:41:20 MDT 2008",
  bibsource =    "http://portal.acm.org/;
                 http://www.math.utah.edu/pub/tex/bib/taco.bib",
  abstract =     "Memory performance is an important design issue for
                 contemporary computer systems given the huge
                 processor/memory speed gap. This paper proposes a
                 space-efficient Java object model for reducing the
                 memory consumption of 64-bit Java virtual machines. We
                 completely eliminate the object header through typed
                 virtual addressing (TVA) or implicit typing. TVA
                 encodes the object type in the object's virtual address
                 by allocating all objects of a given type in a
                 contiguous memory segment. This allows for removing the
                 type information as well as the status field from the
                 object header. Whenever type and status information is
                 needed, masking is applied to the object's virtual
                 address for obtaining an offset into type and status
                 information structures. Unlike previous work on
                 implicit typing, we apply TVA to a selected number of
                 frequently allocated object types, hence, the name
                 selective TVA (STVA); this limits the amount of memory
                 fragmentation. In addition to applying STVA, we also
                 compress the type information block (TIB) pointers for
                 all objects that do not fall under TVA. We implement
                 the space-efficient Java object model in the 64-bit
                 version of the Jikes RVM on an AIX IBM platform and
                 compare its performance against the traditionally used
                 Java object model using a multitude of Java benchmarks.
                 We conclude that the space-efficient Java object model
                 reduces memory consumption by on average 15\% (and up
                 to 45\% for some benchmarks). About one-half the
                 reduction comes from TIB pointer compression; the other
                 one-half comes from STVA. In terms of performance, the
                 space-efficient object model generally does not affect
                 performance; however, for some benchmarks we observe
                 statistically significant performance speedups, up to
                 20\%.",
  acknowledgement = ack-nhfb,
  articleno =    "17",
  fjournal =     "ACM Transactions on Architecture and Code Optimization
                 (TACO)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J924",
  keywords =     "64-bit implementation; implicit typing; Java object
                 model; typed virtual addressing; Virtual machine",
}

@Article{Xiao:2007:VIS,
  author =       "Shu Xiao and Edmund M.-K. Lai",
  title =        "{VLIW} instruction scheduling for minimal power
                 variation",
  journal =      j-TACO,
  volume =       "4",
  number =       "3",
  pages =        "18:1--18:??",
  month =        sep,
  year =         "2007",
  CODEN =        "????",
  DOI =          "10.1145/1275937.1275942",
  ISSN =         "1544-3566 (print), 1544-3973 (electronic)",
  bibdate =      "Mon Jun 16 11:41:20 MDT 2008",
  bibsource =    "http://portal.acm.org/;
                 http://www.math.utah.edu/pub/tex/bib/taco.bib",
  abstract =     "The focus of this paper is on the minimization of the
                 variation in power consumed by a VLIW processor during
                 the execution of a target program through instruction
                 scheduling. The problem is formulated as a
                 mixed-integer program (MIP) and a problem-specific
                 branch-and-bound algorithm has been developed to solve
                 it more efficiently than generic MIP solvers.
                 Simulation results based on the TMS320C6711 VLIW
                 digital signal processor using benchmarks from
                 Mediabench and Trimaran showed that over 40\% average
                 reduction in power variation can be achieved without
                 sacrificing execution speed of these benchmarks.
                 Computational requirements and convergence rates of our
                 algorithm are also analyzed.",
  acknowledgement = ack-nhfb,
  articleno =    "18",
  fjournal =     "ACM Transactions on Architecture and Code Optimization
                 (TACO)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J924",
  keywords =     "instruction scheduling; power variation reduction;
                 VLIW processors",
}

%%% NB: \times is a math-mode-only command, so the overhead factor in
%%% the abstract below is written in math mode as $ 5 \times $.
@Article{Tallam:2007:UCF,
  author =       "Sriraman Tallam and Rajiv Gupta",
  title =        "Unified control flow and data dependence traces",
  journal =      j-TACO,
  volume =       "4",
  number =       "3",
  pages =        "19:1--19:??",
  month =        sep,
  year =         "2007",
  CODEN =        "????",
  DOI =          "http://doi.acm.org/10.1145/1275937.1275943",
  ISSN =         "1544-3566 (print), 1544-3973 (electronic)",
  bibdate =      "Mon Jun 16 11:41:20 MDT 2008",
  bibsource =    "http://portal.acm.org/;
                 http://www.math.utah.edu/pub/tex/bib/taco.bib",
  abstract =     "We describe the design, generation, and compression of
                 the extended whole program path (eWPP), representation
                 that not only captures the control flow history of a
                 program execution but also its data dependence history.
                 This representation is motivated by the observation
                 that, typically, a significant fraction of data
                 dependence history can be recovered from the control
                 flow trace. To capture the remainder of the data
                 dependence history, we introduce disambiguation checks
                 in the program whose control flow signatures capture
                 the results of the checks. The resulting extended
                 control flow trace enables the recovery of otherwise
                 irrecoverable data dependences. The code for the checks
                 is designed to minimize the increase in program
                 execution time and the extended control flow trace size
                 when compared to directly collecting control flow and
                 address traces. Our experiments show that compressed
                 eWPPs are only one-quarter of the size of combined
                 compressed control flow and address traces. However,
                 their collection incurs a $ 5 \times $ increase in runtime
                 overhead relative to the overhead required for directly
                 collecting the control flow and address traces,
                 respectively.",
  acknowledgement = ack-nhfb,
  articleno =    "19",
  fjournal =     "ACM Transactions on Architecture and Code Optimization
                 (TACO)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J924",
  keywords =     "address trace; control flow trace; dynamic data
                 dependence trace; profiling",
}

@Article{Ipek:2008:EAD,
  author = {Engin Ipek and Sally A. McKee and Karan Singh and Rich
    Caruana and Bronis R. de Supinski and Martin Schulz},
  title = {Efficient architectural design space exploration via
    predictive modeling},
  journal = j-TACO,
  volume = {4},
  number = {4},
  pages = {1:1--1:??},
  month = jan,
  year = {2008},
  CODEN = {????},
  DOI = {http://doi.acm.org/10.1145/1328195.1328196},
  ISSN = {1544-3566 (print), 1544-3973 (electronic)},
  bibdate = {Mon Jun 16 11:41:35 MDT 2008},
  bibsource = {http://portal.acm.org/;
    http://www.math.utah.edu/pub/tex/bib/taco.bib},
  abstract = {Efficiently exploring exponential-size architectural
    design spaces with many interacting parameters remains
    an open problem: the sheer number of experiments
    required renders detailed simulation intractable. We
    attack this via an automated approach that builds
    accurate predictive models. We simulate sampled points,
    using results to teach our models the function
    describing relationships among design parameters. The
    models can be queried and are very fast, enabling
    efficient design tradeoff discovery. We validate our
    approach via two uniprocessor sensitivity studies,
    predicting IPC with only 1--2\% error. In an
    experimental study using the approach, training on 1\%
    of a 250-K-point CMP design space allows our models to
    predict performance with only 4--5\% error. Our
    predictive modeling combines well with techniques that
    reduce the time taken by each simulation experiment,
    achieving net time savings of three-four orders of
    magnitude.},
  acknowledgement = ack-nhfb,
  articleno = {1},
  fjournal = {ACM Transactions on Architecture and Code Optimization
    (TACO)},
  journal-URL = {http://portal.acm.org/browse_dl.cfm?idx=J924},
  keywords = {artificial neural networks; design space exploration;
    performance prediction; sensitivity studies},
}

@Article{Shi:2008:VMS,
  author = {Yunhe Shi and Kevin Casey and M. Anton Ertl and David
    Gregg},
  title = {Virtual machine showdown: {Stack} versus registers},
  journal = j-TACO,
  volume = {4},
  number = {4},
  pages = {2:1--2:??},
  month = jan,
  year = {2008},
  CODEN = {????},
  DOI = {http://doi.acm.org/10.1145/1328195.1328197},
  ISSN = {1544-3566 (print), 1544-3973 (electronic)},
  bibdate = {Mon Jun 16 11:41:35 MDT 2008},
  bibsource = {http://portal.acm.org/;
    http://www.math.utah.edu/pub/tex/bib/taco.bib},
  abstract = {Virtual machines (VMs) enable the distribution of
    programs in an architecture-neutral format, which can
    easily be interpreted or compiled. A long-running
    question in the design of VMs is whether a stack
    architecture or register architecture can be
    implemented more efficiently with an interpreter. We
    extend existing work on comparing virtual stack and
    virtual register architectures in three ways. First,
    our translation from stack to register code and
    optimization are much more sophisticated. The result is
    that we eliminate an average of more than 46\% of
    executed VM instructions, with the bytecode size of the
    register machine being only 26\% larger than that of
    the corresponding stack one. Second, we present a fully
    functional virtual-register implementation of the Java
    virtual machine (JVM), which supports Intel, AMD64,
    PowerPC and Alpha processors. This register VM supports
    inline-threaded, direct-threaded, token-threaded, and
    switch dispatch. Third, we present experimental results
    on a range of additional optimizations such as register
    allocation and elimination of redundant heap loads. On
    the AMD64 architecture the register machine using
    switch dispatch achieves an average speedup of 1.48
    over the corresponding stack machine. Even using the
    more efficient inline-threaded dispatch, the register
    VM achieves a speedup of 1.15 over the equivalent
    stack-based VM.},
  acknowledgement = ack-nhfb,
  articleno = {2},
  fjournal = {ACM Transactions on Architecture and Code Optimization
    (TACO)},
  journal-URL = {http://portal.acm.org/browse_dl.cfm?idx=J924},
  keywords = {interpreter; register architecture; stack
    architecture; virtual machine},
}

@Article{Yan:2008:EVR,
  author = {Jun Yan and Wei Zhang},
  title = {Exploiting virtual registers to reduce pressure on
    real registers},
  journal = j-TACO,
  volume = {4},
  number = {4},
  pages = {3:1--3:??},
  month = jan,
  year = {2008},
  CODEN = {????},
  DOI = {http://doi.acm.org/10.1145/1328195.1328198},
  ISSN = {1544-3566 (print), 1544-3973 (electronic)},
  bibdate = {Mon Jun 16 11:41:35 MDT 2008},
  bibsource = {http://portal.acm.org/;
    http://www.math.utah.edu/pub/tex/bib/taco.bib},
  abstract = {It is well known that a large fraction of variables
    are short-lived. This paper proposes a novel approach
    to exploiting this fact to reduce the register pressure
    for pipelined processors with data-forwarding network.
    The idea is that the compiler can allocate virtual
    registers (i.e., place holders to identify dependences
    among instructions) to short-lived variables, which do
    not need to be stored to physical storage locations. As
    a result, real registers (i.e., physically existed
    registers) can be reserved for long-lived variables for
    mitigating the register pressure and decreasing the
    register spills, leading to performance improvement. In
    this paper, we develop the architectural and compiler
    support for exploiting virtual registers for statically
    scheduled processors. Our experimental results show
    that virtual registers are very effective at reducing
    the register spills, which, in many cases, can achieve
    the performance close to the processor with twice
    number of real registers. Our results also indicate
    that, for some applications, using 24 virtual, in
    addition to 8 real registers, can attain even higher
    performance than that of 16 real without any virtual
    registers.},
  acknowledgement = ack-nhfb,
  articleno = {3},
  fjournal = {ACM Transactions on Architecture and Code Optimization
    (TACO)},
  journal-URL = {http://portal.acm.org/browse_dl.cfm?idx=J924},
  keywords = {data forwarding; register allocation; register file;
    short-lived variables; virtual register},
}

@Article{Yu:2008:OCL,
  author = {Zoe C. H. Yu and Francis C. M. Lau and Cho-Li Wang},
  title = {Object co-location and memory reuse for {Java}
    programs},
  journal = j-TACO,
  volume = {4},
  number = {4},
  pages = {4:1--4:??},
  month = jan,
  year = {2008},
  CODEN = {????},
  DOI = {http://doi.acm.org/10.1145/1328195.1328199},
  ISSN = {1544-3566 (print), 1544-3973 (electronic)},
  bibdate = {Mon Jun 16 11:41:35 MDT 2008},
  bibsource = {http://portal.acm.org/;
    http://www.math.utah.edu/pub/tex/bib/taco.bib},
  abstract = {We introduce a new memory management system, STEMA,
    which can improve the execution time of Java programs.
    STEMA detects prolific types on-the-fly and co-locates
    their objects in a special memory space which supports
    reuse of memory. We argue and show that memory reuse
    and co-location of prolific objects can result in
    improved cache locality, reduced memory fragmentation,
    reduced GC time, and faster object allocation. We
    evaluate STEMA using 16 benchmarks. Experimental
    results show that STEMA performs 2.7\%, 4.0\%, and
    8.2\% on average better than MarkSweep, CopyMS, and
    SemiSpace.},
  acknowledgement = ack-nhfb,
  articleno = {4},
  fjournal = {ACM Transactions on Architecture and Code Optimization
    (TACO)},
  journal-URL = {http://portal.acm.org/browse_dl.cfm?idx=J924},
  keywords = {garbage collector; Java; memory allocator; memory
    reuse; mutator; object co-location},
}

@Article{Zhang:2008:RCM,
  author = {Chuanjun Zhang},
  title = {Reducing cache misses through programmable decoders},
  journal = j-TACO,
  volume = {4},
  number = {4},
  pages = {5:1--5:??},
  month = jan,
  year = {2008},
  CODEN = {????},
  DOI = {http://doi.acm.org/10.1145/1328195.1328200},
  ISSN = {1544-3566 (print), 1544-3973 (electronic)},
  bibdate = {Mon Jun 16 11:41:35 MDT 2008},
  bibsource = {http://portal.acm.org/;
    http://www.math.utah.edu/pub/tex/bib/taco.bib},
  abstract = {Level-one caches normally reside on a processor's
    critical path, which determines clock frequency.
    Therefore, fast access to level-one cache is important.
    Direct-mapped caches exhibit faster access time, but
    poor hit rates, compared with same sized
    set-associative caches because of nonuniform accesses
    to the cache sets. The nonuniform accesses generate
    more cache misses in some sets, while other sets are
    underutilized. We propose to increase the decoder
    length and, hence, reduce the accesses to heavily used
    sets without dynamically detecting the cache set usage
    information. We increase the access to the
    underutilized cache sets by incorporating a replacement
    policy into the cache design using programmable
    decoders. On average, the proposed techniques achieve
    as low a miss rate as a traditional 4-way cache on all
    26 SPEC2K benchmarks for the instruction and data
    caches, respectively. This translates into an average
    IPC improvement of 21.5 and 42.4\% for SPEC2K integer
    and floating-point benchmarks, respectively. The
    B-Cache consumes 10.5\% more power per access, but
    exhibits a 12\% total memory access-related energy
    savings as a result of the miss rate reductions, and,
    hence, the reduction to applications' execution time.
    Compared with previous techniques that aim at reducing
    the miss rate of direct-mapped caches, our technique
    requires only one cycle to access all cache hits and
    has the same access time of a direct-mapped cache.},
  acknowledgement = ack-nhfb,
  articleno = {5},
  fjournal = {ACM Transactions on Architecture and Code Optimization
    (TACO)},
  journal-URL = {http://portal.acm.org/browse_dl.cfm?idx=J924},
  keywords = {cache; dynamic optimization; low power},
}

@Article{Golander:2008:HMP,
  author = {Amit Golander and Shlomo Weiss},
  title = {Hiding the misprediction penalty of a
    resource-efficient high-performance processor},
  journal = j-TACO,
  volume = {4},
  number = {4},
  pages = {6:1--6:??},
  month = jan,
  year = {2008},
  CODEN = {????},
  DOI = {http://doi.acm.org/10.1145/1328195.1328201},
  ISSN = {1544-3566 (print), 1544-3973 (electronic)},
  bibdate = {Mon Jun 16 11:41:35 MDT 2008},
  bibsource = {http://portal.acm.org/;
    http://www.math.utah.edu/pub/tex/bib/taco.bib},
  abstract = {Misprediction is a major obstacle for increasing
    speculative out-of-order processors performance.
    Performance degradation depends on both the number of
    misprediction events and the recovery time associated
    with each one of them. In recent years a few checkpoint
    based microarchitectures have been proposed. In
    comparison with ROB-based processors, checkpoint
    processors are scalable and highly resource efficient.
    Unfortunately, in these proposals the misprediction
    recovery time is proportional to the instruction queue
    size.\par

    In this paper we analyze methods to reduce the
    misprediction recovery time. We propose a new register
    file management scheme and techniques to selectively
    flush the instruction queue and the load store queue,
    and to isolate deeply pipelined execution units. The
    result is a novel checkpoint processor with Constant
    misprediction RollBack time (CRB). We further present a
    streamlined, cost-efficient solution, which saves
    complexity at the price of slightly lower
    performance.},
  acknowledgement = ack-nhfb,
  articleno = {6},
  fjournal = {ACM Transactions on Architecture and Code Optimization
    (TACO)},
  journal-URL = {http://portal.acm.org/browse_dl.cfm?idx=J924},
  keywords = {checkpoints; misprediction; out-of-order execution;
    rollback; scalable architecture},
}

@Article{Calder:2008:E,
  author = {Brad Calder and Dean Tullsen},
  title = {Editorial},
  journal = j-TACO,
  volume = {5},
  number = {1},
  pages = {1:1--1:??},
  month = may,
  year = {2008},
  CODEN = {????},
  DOI = {http://doi.acm.org/10.1145/1369396.1369397},
  ISSN = {1544-3566 (print), 1544-3973 (electronic)},
  bibdate = {Mon Jun 16 11:41:51 MDT 2008},
  bibsource = {http://portal.acm.org/;
    http://www.math.utah.edu/pub/tex/bib/taco.bib},
  acknowledgement = ack-nhfb,
  articleno = {1},
  fjournal = {ACM Transactions on Architecture and Code Optimization
    (TACO)},
  journal-URL = {http://portal.acm.org/browse_dl.cfm?idx=J924},
}

@Article{Mysore:2008:FIP,
  author = {Shashidhar Mysore and Banit Agrawal and Rodolfo Neuber
    and Timothy Sherwood and Nisheeth Shrivastava and
    Subhash Suri},
  title = {Formulating and implementing profiling over adaptive
    ranges},
  journal = j-TACO,
  volume = {5},
  number = {1},
  pages = {2:1--2:??},
  month = may,
  year = {2008},
  CODEN = {????},
  DOI = {http://doi.acm.org/10.1145/1369396.1369398},
  ISSN = {1544-3566 (print), 1544-3973 (electronic)},
  bibdate = {Mon Jun 16 11:41:51 MDT 2008},
  bibsource = {http://portal.acm.org/;
    http://www.math.utah.edu/pub/tex/bib/taco.bib},
  abstract = {Modern computer systems are called on to deal with
    billions of events every second, whether they are
    executed instructions, accessed memory locations, or
    forwarded packets. This presents a serious challenge to
    those who seek to quantify, analyze, or optimize such
    systems, because important trends and behaviors may
    easily be lost in a sea of data. We present
    range-adaptive profiling (RAP) as a new and
    general-purpose profiling method capable of
    hierarchically efficiently classifying streams of data
    in hardware. Through the use of RAP, events in an input
    stream are dynamically classified into increasingly
    precise categories, based on the frequency with which
    they occur. The more important a class, or range of
    events, the more precisely it is quantified. Despite
    the dynamic nature of our technique, we build upon
    tight theoretic bounds covering both worst-case error,
    as well as the required memory. In the limit, it is
    known that error and the memory bounds can be
    independent of the stream size and grow only linearly
    with the level of precision desired. Significantly, we
    expose the critical constants in these algorithms and
    through careful engineering, algorithm redesign, and
    use of heuristics, we show how a high-performance
    profile system can be implemented for range-adaptive
    profiling. RAP can be used on various profiles, such as
    PCs, load values, and memory addresses, and has a broad
    range of uses, from hot-region profiling to quantifying
    cache miss value locality. We propose two methods of
    implementation of RAP, one in software and the other
    with specialized hardware, for which we also describe
    our prototype FPGA implementation. We show that with
    just 8KB of memory, range profiles can be gathered with
    an average accuracy of 98\%.},
  acknowledgement = ack-nhfb,
  articleno = {2},
  fjournal = {ACM Transactions on Architecture and Code Optimization
    (TACO)},
  journal-URL = {http://portal.acm.org/browse_dl.cfm?idx=J924},
  keywords = {profiling hardware; range adaptive; value locality},
}

@Article{Zhai:2008:CHS,
  author = {Antonia Zhai and J. Gregory Steffan and Christopher B.
    Colohan and Todd C. Mowry},
  title = {Compiler and hardware support for reducing the
    synchronization of speculative threads},
  journal = j-TACO,
  volume = {5},
  number = {1},
  pages = {3:1--3:??},
  month = may,
  year = {2008},
  CODEN = {????},
  DOI = {http://doi.acm.org/10.1145/1369396.1369399},
  ISSN = {1544-3566 (print), 1544-3973 (electronic)},
  bibdate = {Mon Jun 16 11:41:51 MDT 2008},
  bibsource = {http://portal.acm.org/;
    http://www.math.utah.edu/pub/tex/bib/taco.bib},
  abstract = {Thread-level speculation (TLS) allows us to
    automatically parallelize general-purpose programs by
    supporting parallel execution of threads that might not
    actually be independent. In this article, we focus on
    one important limitation of program performance under
    TLS, which stalls as a result of synchronizing and
    forwarding scalar values between speculative threads
    that would otherwise cause frequent data dependences
    and, hence, failed speculation. Using SPECint
    benchmarks that have been automatically transformed by
    our compiler to exploit TLS, we present, evaluate in
    detail, and compare both compiler and hardware
    techniques for improving the communication of scalar
    values. We find that through our dataflow algorithms
    for three increasingly aggressive instruction
    scheduling techniques, the compiler can drastically
    reduce the critical forwarding path introduced by the
    synchronization and forwarding of scalar values. We
    also show that hardware techniques for reducing
    synchronization can be complementary to compiler
    scheduling, but that the additional performance
    benefits are minimal and are generally not worth the
    cost.},
  acknowledgement = ack-nhfb,
  articleno = {3},
  fjournal = {ACM Transactions on Architecture and Code Optimization
    (TACO)},
  journal-URL = {http://portal.acm.org/browse_dl.cfm?idx=J924},
  keywords = {automatic parallelization; chip-multiprocessing;
    instruction scheduling; thread-level speculation},
}

@Article{Winter:2008:ATN,
  author = {Jonathan A. Winter and David H. Albonesi},
  title = {Addressing thermal nonuniformity in {SMT} workloads},
  journal = j-TACO,
  volume = {5},
  number = {1},
  pages = {4:1--4:??},
  month = may,
  year = {2008},
  CODEN = {????},
  DOI = {http://doi.acm.org/10.1145/1369396.1369400},
  ISSN = {1544-3566 (print), 1544-3973 (electronic)},
  bibdate = {Mon Jun 16 11:41:51 MDT 2008},
  bibsource = {http://portal.acm.org/;
    http://www.math.utah.edu/pub/tex/bib/taco.bib},
  abstract = {We explore DTM techniques within the context of
    uniform and nonuniform SMT workloads. While DVS is
    suitable for addressing workloads with uniformly high
    temperatures, for nonuniform workloads, performance
    loss occurs because of the slowdown of the cooler
    thread. To address this, we propose and evaluate DTM
    mechanisms that exploit the steering-based thread
    management mechanisms inherent in a clustered SMT
    architecture. We show that in contrast to DVS, which
    operates globally, our techniques are more effective at
    controlling temperature for nonuniform workloads.
    Furthermore, we devise a DTM technique that combines
    steering and DVS to achieve consistently good
    performance across all workloads.},
  acknowledgement = ack-nhfb,
  articleno = {4},
  fjournal = {ACM Transactions on Architecture and Code Optimization
    (TACO)},
  journal-URL = {http://portal.acm.org/browse_dl.cfm?idx=J924},
  keywords = {adaptive microarchitectures; clustered
    microarchitectures; dynamic thermal management; dynamic
    voltage scaling; simultaneous multithreading},
}

@Article{Shahbahrami:2008:VES,
  author = {Asadollah Shahbahrami and Ben Juurlink and Stamatis
    Vassiliadis},
  title = {Versatility of extended subwords and the matrix
    register file},
  journal = j-TACO,
  volume = {5},
  number = {1},
  pages = {5:1--5:??},
  month = may,
  year = {2008},
  CODEN = {????},
  DOI = {http://doi.acm.org/10.1145/1369396.1369401},
  ISSN = {1544-3566 (print), 1544-3973 (electronic)},
  bibdate = {Mon Jun 16 11:41:51 MDT 2008},
  bibsource = {http://portal.acm.org/;
    http://www.math.utah.edu/pub/tex/bib/taco.bib},
  abstract = {Extended subwords and the matrix register file (MRF)
    are two micro architectural techniques that address
    some of the limitations of existing SIMD architectures.
    Extended subwords are wider than the data stored in
    memory. Specifically, for every byte of data stored in
    memory, there are four extra bits in the media register
    file. This avoids the need for data-type conversion
    instructions. The MRF is a register file organization
    that provides both conventional row-wise, as well as
    column-wise, access to the register file. In other
    words, it allows to view the register file as a matrix
    in which corresponding subwords in different registers
    corresponds to a column of the matrix. It was
    introduced to accelerate matrix transposition which is
    a very common operation in multimedia applications. In
    this paper, we show that the MRF is very versatile,
    since it can also be used for other permutations than
    matrix transposition. Specifically, it is shown how it
    can be used to provide efficient access to strided
    data, as is needed in, e.g., color space conversion.
    Furthermore, it is shown that special-purpose
    instructions (SPIs), such as the sum-of-absolute
    differences (SAD) instruction, have limited usefulness
    when extended subwords and a few general SIMD
    instructions that we propose are supported, for the
    following reasons. First, when extended subwords are
    supported, the SAD instruction provides only a
    relatively small performance improvement. Second, the
    SAD instruction processes 8-bit subwords only, which is
    not sufficient for quarter-pixel resolution nor for
    cost functions used in image and video retrieval.
    Results obtained by extending the SimpleScalar toolset
    show that the proposed techniques provide a speedup of
    up to 3.00 over the MMX architecture. The results also
    show that using, at most, 13 extra media registers
    yields an additional performance improvement ranging
    from 1.3 to 1.57.},
  acknowledgement = ack-nhfb,
  articleno = {5},
  fjournal = {ACM Transactions on Architecture and Code Optimization
    (TACO)},
  journal-URL = {http://portal.acm.org/browse_dl.cfm?idx=J924},
  keywords = {multimedia standards; SIMD architectures; SIMD
    programming},
}

@Article{Guo:2008:EHC,
  author = {Zhi Guo and Walid Najjar and Betul Buyukkurt},
  title = {Efficient hardware code generation for {FPGAs}},
  journal = j-TACO,
  volume = {5},
  number = {1},
  pages = {6:1--6:??},
  month = may,
  year = {2008},
  CODEN = {????},
  DOI = {http://doi.acm.org/10.1145/1369396.1369402},
  ISSN = {1544-3566 (print), 1544-3973 (electronic)},
  bibdate = {Mon Jun 16 11:41:51 MDT 2008},
  bibsource = {http://portal.acm.org/;
    http://www.math.utah.edu/pub/tex/bib/taco.bib},
  abstract = {The wider acceptance of FPGAs as a computing device
    requires a higher level of programming abstraction.
    ROCCC is an optimizing C to HDL compiler. We describe
    the code generation approach in ROCCC. The smart buffer
    is a component that reuses input data between adjacent
    iterations. It significantly improves the performance
    of the circuit and simplifies loop control. The
    ROCCC-generated datapath can execute one loop iteration
    per clock cycle when there is no loop dependency or
    there is only scalar recurrence variable dependency.
    ROCCC's approach to supporting while-loops operating on
    scalars makes the compiler able to move scalar
    iterative computation into hardware.},
  acknowledgement = ack-nhfb,
  articleno = {6},
  fjournal = {ACM Transactions on Architecture and Code Optimization
    (TACO)},
  journal-URL = {http://portal.acm.org/browse_dl.cfm?idx=J924},
  keywords = {data reuse; FPGA; high-level synthesis; reconfigurable
    computing; VHDL},
}

@article{Kotzmann:2008:DJH,
  author          = {Thomas Kotzmann and Christian Wimmer and Hanspeter
                     M{\"o}ssenb{\"o}ck and Thomas Rodriguez and Kenneth
                     Russell and David Cox},
  title           = {Design of the {Java HotSpot\TM} client compiler for
                     {Java 6}},
  journal         = j-TACO,
  fjournal        = {ACM Transactions on Architecture and Code Optimization
                     (TACO)},
  volume          = {5},
  number          = {1},
  pages           = {7:1--7:??},
  articleno       = {7},
  month           = may,
  year            = {2008},
  coden           = {????},
  issn            = {1544-3566 (print), 1544-3973 (electronic)},
  doi             = {http://doi.acm.org/10.1145/1369396.1370017},
  journal-url     = {http://portal.acm.org/browse_dl.cfm?idx=J924},
  bibdate         = {Mon Jun 16 11:41:51 MDT 2008},
  bibsource       = {http://portal.acm.org/;
                     http://www.math.utah.edu/pub/tex/bib/taco.bib},
  acknowledgement = ack-nhfb,
  keywords        = {compiler; deoptimization; intermediate representation;
                     Java; just-in-time compilation; optimization; register
                     allocation},
  abstract        = {Version 6 of Sun Microsystems' Java HotSpot{\TM} VM
                     ships with a redesigned version of the client
                     just-in-time compiler that includes several research
                     results of the last years. The client compiler is at
                     the heart of the VM configuration used by default for
                     interactive desktop applications. For such
                     applications, low startup and pause times are more
                     important than peak performance. This paper outlines
                     the new architecture of the client compiler and shows
                     how it interacts with the VM. It presents the
                     intermediate representation that now uses static
                     single-assignment (SSA) form and the linear scan
                     algorithm for global register allocation. Efficient
                     support for exception handling and deoptimization
                     fulfills the demands that are imposed by the dynamic
                     features of the Java programming language. The
                     evaluation shows that the new client compiler generates
                     better code in less time. The popular SPECjvm98
                     benchmark suite is executed 45\% faster, while the
                     compilation speed is also up to 40\% better. This
                     indicates that a carefully selected set of global
                     optimizations can also be integrated in just-in-time
                     compilers that focus on compilation speed and not on
                     peak performance. In addition, the paper presents the
                     impact of several optimizations on execution and
                     compilation speed. As the source code is freely
                     available, the Java HotSpot{\TM} VM and the client
                     compiler are the ideal basis for experiments with new
                     feedback-directed optimizations in a production-level
                     Java just-in-time compiler. The paper outlines research
                     projects that add fast algorithms for escape analysis,
                     automatic object inlining, and array bounds check
                     elimination.},
}

@article{Rangan:2008:PSD,
  author          = {Ram Rangan and Neil Vachharajani and Guilherme Ottoni
                     and David I. August},
  title           = {Performance scalability of decoupled software
                     pipelining},
  journal         = j-TACO,
  fjournal        = {ACM Transactions on Architecture and Code Optimization
                     (TACO)},
  volume          = {5},
  number          = {2},
  pages           = {8:1--8:??},
  articleno       = {8},
  month           = aug,
  year            = {2008},
  coden           = {????},
  issn            = {1544-3566 (print), 1544-3973 (electronic)},
  doi             = {http://doi.acm.org/10.1145/1400112.1400113},
  journal-url     = {http://portal.acm.org/browse_dl.cfm?idx=J924},
  bibdate         = {Thu Aug 28 13:25:00 MDT 2008},
  bibsource       = {http://portal.acm.org/;
                     http://www.math.utah.edu/pub/tex/bib/taco.bib},
  acknowledgement = ack-nhfb,
  keywords        = {decoupled software pipelining; performance analysis},
  abstract        = {Any successful solution to using multicore processors
                     to scale general-purpose program performance will have
                     to contend with rising intercore communication costs
                     while exposing coarse-grained parallelism. Recently
                     proposed pipelined multithreading (PMT) techniques have
                     been demonstrated to have general-purpose applicability
                     and are also able to effectively tolerate inter-core
                     latencies through pipelined interthread communication.
                     These desirable properties make PMT techniques strong
                     candidates for program parallelization on current and
                     future multicore processors and understanding their
                     performance characteristics is critical to their
                     deployment. To that end, this paper evaluates the
                     performance scalability of a general-purpose PMT
                     technique called decoupled software pipelining (DSWP)
                     and presents a thorough analysis of the communication
                     bottlenecks that must be overcome for optimal DSWP
                     scalability.},
}

@Article{Long:2008:TMM,
  author =       "Jieyi Long and Seda Ogrenci Memik and Gokhan Memik and
                 Rajarshi Mukherjee",
  title =        "Thermal monitoring mechanisms for chip
                 multiprocessors",
  journal =      j-TACO,
  volume =       "5",
  number =       "2",
  pages =        "9:1--9:??",
  month =        aug,
  year =         "2008",
  CODEN =        "????",
  DOI =          "http://doi.acm.org/10.1145/1400112.1400114",
  ISSN =         "1544-3566 (print), 1544-3973 (electronic)",
  bibdate =      "Thu Aug 28 13:25:00 MDT 2008",
  bibsource =    "http://portal.acm.org/;
                 http://www.math.utah.edu/pub/tex/bib/taco.bib",
  abstract =     "With large-scale integration and increasing power
                 densities, thermal management has become an important
                 tool to maintain performance and reliability in modern
                 process technologies. In the core of dynamic thermal
                 management schemes lies accurate reading of on-die
                 temperatures. Therefore, careful planning and embedding
                 of thermal monitoring mechanisms into high-performance
                 systems becomes crucial. In this paper, we propose
                 three techniques to create sensor infrastructures for
                 monitoring the maximum temperature on a multicore
                 system. Initially, we extend a nonuniform sensor
                 placement methodology proposed in the literature to
                 handle chip multiprocessors (CMPs) and show its
                 limitations. We then analyze a grid-based approach
                 where the sensors are placed on a static grid covering
                 each core and show that the sensor readings can differ
                 from the actual maximum core temperature by as much as
                 12.6$^\circ$C when using 16 sensors per core. Also, as
                 large as 10.6\% of the thermal emergencies are not
                 captured using the same number of sensors. Based on
                 this observation, we first develop an interpolation
                 scheme, which estimates the maximum core temperature
                 through interpolation of the readings collected at the
                 static grid points. We show that the interpolation
                 scheme improves the measurement accuracy and emergency
                 coverage compared to grid-based placement when using
                 the same number of sensors. Second, we present a
                 dynamic scheme where only a subset of the sensor
                 readings is collected to predict the maximum
                 temperature of each core. Our results indicate that, we
                 can reduce the number of active sensors by as much as
                 50\%, while maintaining similar measurement accuracy
                 and emergency coverage compared to the case where the
                 entire sensor set on the grid is sampled at all
                 times.",
  acknowledgement = ack-nhfb,
  articleno =    "9",
  fjournal =     "ACM Transactions on Architecture and Code Optimization
                 (TACO)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J924",
  keywords =     "nonuniform and uniform sensor placement; thermal
                 sensor allocation",
}

@Article{Joshi:2008:DEP,
  author =       "Ajay Joshi and Lieven Eeckhout and Robert H. {Bell,
                 Jr.} and Lizy K. John",
  title =        "Distilling the essence of proprietary workloads into
                 miniature benchmarks",
  journal =      j-TACO,
  volume =       "5",
  number =       "2",
  pages =        "10:1--10:??",
  month =        aug,
  year =         "2008",
  CODEN =        "????",
  DOI =          "http://doi.acm.org/10.1145/1400112.1400115",
  ISSN =         "1544-3566 (print), 1544-3973 (electronic)",
  bibdate =      "Thu Aug 28 13:25:00 MDT 2008",
  bibsource =    "http://portal.acm.org/;
                 http://www.math.utah.edu/pub/tex/bib/taco.bib",
  abstract =     "Benchmarks set standards for innovation in computer
                 architecture research and industry product development.
                 Consequently, it is of paramount importance that these
                 workloads are representative of real-world
                 applications. However, composing such representative
                 workloads poses practical challenges to application
                 analysis teams and benchmark developers---(1) real-world
                 workloads are intellectual property and vendors
                 hesitate to share these proprietary applications; and
                 (2) porting and reducing these applications to
                 benchmarks that can be simulated in a tractable amount
                 of time is a nontrivial task. In this paper, we address
                 this problem by proposing a technique that
                 automatically distills key inherent behavioral
                 attributes of a proprietary workload and captures them
                 into a miniature synthetic benchmark clone. The
                 advantage of the benchmark clone is that it hides the
                 functional meaning of the code but exhibits similar
                 performance characteristics as the target application.
                 Moreover, the dynamic instruction count of the
                 synthetic benchmark clone is substantially shorter than
                 the proprietary application, greatly reducing overall
                 simulation time---for SPEC CPU, the simulation time
                 reduction is over five orders of magnitude compared to
                 entire benchmark execution. Using a set of benchmarks
                 representative of general-purpose, scientific, and
                 embedded applications, we demonstrate that the power
                 and performance characteristics of the synthetic
                 benchmark clone correlate well with those of the
                 original application across a wide range of
                 microarchitecture configurations.",
  acknowledgement = ack-nhfb,
  articleno =    "10",
  fjournal =     "ACM Transactions on Architecture and Code Optimization
                 (TACO)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J924",
  keywords =     "benchmark cloning; benchmarks; workload
                 characterization",
}

@article{Catania:2008:RCM,
  author          = {Vincenzo Catania and Maurizio Palesi and Davide
                     Patti},
  title           = {Reducing complexity of multiobjective design space
                     exploration in {VLIW}-based embedded systems},
  journal         = j-TACO,
  fjournal        = {ACM Transactions on Architecture and Code Optimization
                     (TACO)},
  volume          = {5},
  number          = {2},
  pages           = {11:1--11:??},
  articleno       = {11},
  month           = aug,
  year            = {2008},
  coden           = {????},
  issn            = {1544-3566 (print), 1544-3973 (electronic)},
  doi             = {http://doi.acm.org/10.1145/1400112.1400116},
  journal-url     = {http://portal.acm.org/browse_dl.cfm?idx=J924},
  bibdate         = {Thu Aug 28 13:25:00 MDT 2008},
  bibsource       = {http://portal.acm.org/;
                     http://www.math.utah.edu/pub/tex/bib/taco.bib},
  acknowledgement = ack-nhfb,
  keywords        = {design space exploration; energy; genetic algorithms;
                     hyperblock formation; ILP; multiobjective optimization;
                     performances; power; statistical analysis; VLIW
                     architectures},
  abstract        = {Architectures based on very-long instruction word
                     (VLIW) have found fertile ground in multimedia
                     electronic appliances thanks to their ability to
                     exploit high degrees of instruction level parallelism
                     (ILP) with a reasonable trade-off in complexity and
                     silicon cost. Specialization of such architectures
                     involves the configuration of both hardware-related
                     aspects (e.g., register files, functional units, memory
                     subsystem) and software-related issues (e.g., the
                     compilation strategy). The complex interactions between
                     the components of such systems will force a human
                     designer to rely on judgment and experience in
                     designing them, possibly eliminating interesting
                     configurations, and making tuning of the system, for
                     either power, energy, or performance, difficult. In
                     this paper we propose tools and methodologies to
                     efficiently cope with this complexity from a
                     multiobjective perspective. We first analyze the impact
                     of ILP-oriented code transformations using two
                     alternative compilation profiles to quantitatively show
                     the effect of such transformations on typical design
                     objectives like performance, power dissipation, and
                     energy consumption. Next, by means of statistical
                     analysis, we collect useful data to predict the
                     effectiveness of a given compilation profiles for a
                     specific application. Information gathered from such
                     analysis can be exploited to drastically reduce the
                     computational effort needed to perform the design space
                     exploration.},
}

@Article{Leverich:2008:CEM,
  author =       "Jacob Leverich and Hideho Arakida and Alex
                 Solomatnikov and Amin Firoozshahian and Mark Horowitz
                 and Christos Kozyrakis",
  title =        "Comparative evaluation of memory models for chip
                 multiprocessors",
  journal =      j-TACO,
  volume =       "5",
  number =       "3",
  pages =        "12:1--12:??",
  month =        nov,
  year =         "2008",
  CODEN =        "????",
  DOI =          "http://doi.acm.org/10.1145/1455650.1455651",
  ISSN =         "1544-3566 (print), 1544-3973 (electronic)",
  bibdate =      "Mon Dec 8 14:28:18 MST 2008",
  bibsource =    "http://portal.acm.org/;
                 http://www.math.utah.edu/pub/tex/bib/taco.bib",
  abstract =     "There are two competing models for the on-chip memory
                 in Chip Multiprocessor (CMP) systems: {\em
                 hardware-managed coherent caches\/} and {\em
                 software-managed streaming memory}. This paper performs
                 a direct comparison of the two models under the same
                 set of assumptions about technology, area, and
                 computational capabilities. The goal is to quantify how
                 and when they differ in terms of performance, energy
                 consumption, bandwidth requirements, and latency
                 tolerance for general-purpose CMPs. We demonstrate that
                 for data-parallel applications on systems with up to 16
                 cores, the cache-based and streaming models perform and
                 scale equally well. For certain applications with
                 little data reuse, streaming scales better due to
                 better bandwidth use and macroscopic software
                 prefetching. However, the introduction of techniques
                 such as hardware prefetching and nonallocating stores
                 to the cache-based model eliminates the streaming
                 advantage. Overall, our results indicate that there is
                 not sufficient advantage in building streaming memory
                 systems where all on-chip memory structures are
                 explicitly managed. On the other hand, we show that
                 streaming at the programming model level is
                 particularly beneficial, even with the cache-based
                 model, as it enhances locality and creates
                 opportunities for bandwidth optimizations. Moreover, we
                 observe that stream programming is actually easier with
                 the cache-based model because the hardware guarantees
                 correct, best-effort execution even when the programmer
                 cannot fully regularize an application's code.",
  acknowledgement = ack-nhfb,
  articleno =    "12",
  fjournal =     "ACM Transactions on Architecture and Code Optimization
                 (TACO)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J924",
  keywords =     "cache coherence; chip multiprocessors; locality
                 optimizations; parallel programming; streaming memory",
}

@Article{Sharkey:2008:RRP,
  author =       "Joseph J. Sharkey and Jason Loew and Dmitry V.
                 Ponomarev",
  title =        "Reducing register pressure in {SMT} processors through
                 {L2}-miss-driven early register release",
  journal =      j-TACO,
  volume =       "5",
  number =       "3",
  pages =        "13:1--13:??",
  month =        nov,
  year =         "2008",
  CODEN =        "????",
  DOI =          "http://doi.acm.org/10.1145/1455650.1455652",
  ISSN =         "1544-3566 (print), 1544-3973 (electronic)",
  bibdate =      "Mon Dec 8 14:28:18 MST 2008",
  bibsource =    "http://portal.acm.org/;
                 http://www.math.utah.edu/pub/tex/bib/taco.bib",
  abstract =     "The register file is one of the most critical datapath
                 components limiting the number of threads that can be
                 supported on a simultaneous multithreading (SMT)
                 processor. To allow the use of smaller register files
                 without degrading performance, techniques that maximize
                 the efficiency of using registers through aggressive
                 register allocation/deallocation can be considered. In
                 this article, we propose a novel technique to early
                 deallocate physical registers allocated to threads
                 which experience L2 cache misses. This is accomplished
                 by speculatively committing the load-independent
                 instructions and deallocating the registers
                 corresponding to the previous mappings of their
                 destinations, without waiting for the cache miss
                 request to be serviced. The early deallocated registers
                 are then made immediately available for allocation to
                 instructions within the same thread as well as within
                 other threads, thus improving the overall processor
                 throughput. On the average across the simulated mixes
                 of multiprogrammed SPEC 2000 workloads, our technique
                 results in 33\% improvement in throughput and 25\%
                 improvement in terms of harmonic mean of weighted IPCs
                 over the baseline SMT with the state-of-the-art DCRA
                 policy. This is achieved without creating checkpoints,
                 maintaining per-register counters of pending consumers,
                 performing tag rebroadcasts, register remappings,
                 and/or additional associative searches.",
  acknowledgement = ack-nhfb,
  articleno =    "13",
  fjournal =     "ACM Transactions on Architecture and Code Optimization
                 (TACO)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J924",
  keywords =     "register file; simultaneous multithreading",
}

@Article{Mehrara:2008:ESP,
  author =       "Mojtaba Mehrara and Todd Austin",
  title =        "Exploiting selective placement for low-cost memory
                 protection",
  journal =      j-TACO,
  volume =       "5",
  number =       "3",
  pages =        "14:1--14:??",
  month =        nov,
  year =         "2008",
  CODEN =        "????",
  DOI =          "http://doi.acm.org/10.1145/1455650.1455653",
  ISSN =         "1544-3566 (print), 1544-3973 (electronic)",
  bibdate =      "Mon Dec 8 14:28:18 MST 2008",
  bibsource =    "http://portal.acm.org/;
                 http://www.math.utah.edu/pub/tex/bib/taco.bib",
  abstract =     "Many embedded processing applications, such as those
                 found in the automotive or medical field, require
                 hardware designs that are at the same time low cost and
                 reliable. Traditionally, reliable memory systems have
                 been implemented using coded storage techniques, such
                 as ECC. While these designs can effectively detect and
                 correct memory faults such as transient errors and
                 single-bit defects, their use bears a significant cost
                 overhead. In this article, we propose a novel partial
                 memory protection scheme that provides high-coverage
                 fault protection for program code and data, but with
                 much lower cost than traditional approaches. Our
                 approach profiles program code and data usage to assess
                 which program elements are most critical to maintaining
                 program correctness. Critical code and variables are
                 then placed into a limited protected storage resources.
                 To ensure high coverage of program elements, our
                 placement technique considers all program components
                 simultaneously, including code, global variables, stack
                 frames, and heap variables. The fault coverage of our
                 approach is gauged using Monte Carlo fault-injection
                 experiments, which confirm that our technique provides
                 high levels of fault protection (99\% coverage) with
                 limited memory protection resources (36\% protected
                 area).",
  acknowledgement = ack-nhfb,
  articleno =    "14",
  fjournal =     "ACM Transactions on Architecture and Code Optimization
                 (TACO)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J924",
  keywords =     "fault-tolerant design; memory system design; partial
                 memory protection; selective placement; transient
                 faults",
}

@Article{Vandierendonck:2008:SRA,
  author =       "Hans Vandierendonck and Andr{\'e} Seznec",
  title =        "Speculative return address stack management
                 revisited",
  journal =      j-TACO,
  volume =       "5",
  number =       "3",
  pages =        "15:1--15:??",
  month =        nov,
  year =         "2008",
  CODEN =        "????",
  DOI =          "http://doi.acm.org/10.1145/1455650.1455654",
  ISSN =         "1544-3566 (print), 1544-3973 (electronic)",
  bibdate =      "Mon Dec 8 14:28:18 MST 2008",
  bibsource =    "http://portal.acm.org/;
                 http://www.math.utah.edu/pub/tex/bib/taco.bib",
  abstract =     "Branch prediction feeds a speculative execution
                 processor core with instructions. Branch mispredictions
                 are inevitable and have negative effects on performance
                 and energy consumption. With the advent of highly
                 accurate conditional branch predictors, nonconditional
                 branch instructions are gaining importance.\par

                 In this article, we address the prediction of procedure
                 returns. On modern processors, procedure returns are
                 predicted through a return address stack (RAS). The
                 overwhelming majority of the return mispredictions are
                 due to RAS overflows and/or overwriting the top entries
                 of the RAS on a mispredicted path. These sources of
                 misprediction were addressed by previously proposed
                 speculative return address stacks [Jourdan et al. 1996;
                 Skadron et al. 1998]. However, the remaining
                 misprediction rate of these RAS designs is still
                 significant when compared to state-of-the-art
                 conditional predictors.\par

                 We present two low-cost corruption detectors for RAS
                 predictors. They detect RAS overflows and wrong path
                 corruption with 100\% coverage. As a consequence, when
                 such a corruption is detected, another source can be
                 used for predicting the return. On processors featuring
                 a branch target buffer (BTB), this BTB can be used as a
                 free backup predictor for predicting returns when
                 corruption is detected.\par

                 Our experiments show that our proposal can be used to
                 improve the behavior of all previously proposed
                 speculative RASs. For instance, without any specific
                 management of the speculative states on the RAS, an
                 8-entry BTB-backed up RAS achieves the same performance
                 level as a state-of-the-art, but complex, 64-entry
                 self-checkpointing RAS [Jourdan et al. 1996].
                 Therefore, our proposal can be used either to improve
                 the performance of the processor or to reduce its
                 hardware complexity.",
  acknowledgement = ack-nhfb,
  articleno =    "15",
  fjournal =     "ACM Transactions on Architecture and Code Optimization
                 (TACO)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J924",
  keywords =     "back-up predictor; corruption detection; return
                 address prediction",
}

@Article{Chhabra:2009:MSP,
  author =       "Siddhartha Chhabra and Brian Rogers and Yan Solihin
                 and Milos Prvulovic",
  title =        "Making secure processors {OS}- and
                 performance-friendly",
  journal =      j-TACO,
  volume =       "5",
  number =       "4",
  pages =        "16:1--16:??",
  month =        mar,
  year =         "2009",
  CODEN =        "????",
  DOI =          "http://doi.acm.org/10.1145/1498690.1498691",
  ISSN =         "1544-3566 (print), 1544-3973 (electronic)",
  bibdate =      "Wed Mar 18 21:35:33 MDT 2009",
  bibsource =    "http://portal.acm.org/;
                 http://www.math.utah.edu/pub/tex/bib/taco.bib",
  abstract =     "In today's digital world, computer security issues
                 have become increasingly important. In particular,
                 researchers have proposed designs for secure processors
                 that utilize hardware-based memory encryption and
                 integrity verification to protect the privacy and
                 integrity of computation even from sophisticated
                 physical attacks. However, currently proposed schemes
                 remain hampered by problems that make them impractical
                 for use in today's computer systems: lack of virtual
                 memory and Inter-Process Communication support as well
                 as excessive storage and performance overheads. In this
                 article, we propose (1) address independent seed
                 encryption (AISE), a counter-mode-based memory
                 encryption scheme using a novel seed composition, and
                 (2) bonsai Merkle trees (BMT), a novel Merkle
                 tree-based memory integrity verification technique, to
                 eliminate these system and performance issues
                 associated with prior counter-mode memory encryption
                 and Merkle tree integrity verification schemes. We
                 present both a qualitative discussion and a
                 quantitative analysis to illustrate the advantages of
                 our techniques over previously proposed approaches in
                 terms of complexity, feasibility, performance, and
                 storage. Our results show that AISE+BMT reduces the
                 overhead of prior memory encryption and integrity
                 verification schemes from 12\% to 2\% on average for
                 single-threaded benchmarks on uniprocessor systems, and
                 from 15\% to 4\% for coscheduled benchmarks on
                 multicore systems while eliminating critical
                 system-level problems.",
  acknowledgement = ack-nhfb,
  articleno =    "16",
  fjournal =     "ACM Transactions on Architecture and Code Optimization
                 (TACO)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J924",
  keywords =     "memory encryption; memory integrity verification;
                 secure processor architectures; virtualization",
}

@Article{Jimenez:2009:GNB,
  author =       "Daniel A. Jim{\'e}nez",
  title =        "Generalizing neural branch prediction",
  journal =      j-TACO,
  volume =       "5",
  number =       "4",
  pages =        "17:1--17:??",
  month =        mar,
  year =         "2009",
  CODEN =        "????",
  DOI =          "http://doi.acm.org/10.1145/1498690.1498692",
  ISSN =         "1544-3566 (print), 1544-3973 (electronic)",
  bibdate =      "Wed Mar 18 21:35:33 MDT 2009",
  bibsource =    "http://portal.acm.org/;
                 http://www.math.utah.edu/pub/tex/bib/taco.bib",
  abstract =     "Improved branch prediction accuracy is essential to
                 sustaining instruction throughput with today's deep
                 pipelines. Traditional branch predictors exploit
                 correlations between pattern history and branch outcome
                 to predict branches, but there is a stronger and more
                 natural correlation between path history and branch
                 outcome. We explore the potential for exploiting this
                 correlation. We introduce {\em piecewise linear branch
                 prediction}, an idealized branch predictor that
                 develops a set of linear functions, one for each
                 program path to the branch to be predicted, that
                 separate predicted taken from predicted not taken
                 branches. Taken together, all of these linear functions
                 form a piecewise linear decision surface. We present a
                 limit study of this predictor showing its potential to
                 greatly improve predictor accuracy.\par

                 We then introduce a practical implementable branch
                 predictor based on piecewise linear branch prediction.
                 In making our predictor practical, we show how a
                 parameterized version of it unifies the previously
                 distinct concepts of perceptron prediction and
                 path-based neural prediction. Our new branch predictor
                 has implementation costs comparable to current
                 prominent predictors in the literature while
                 significantly improving accuracy. For a deeply
                 pipelined simulated microarchitecture our predictor
                 with a 256-KB hardware budget improves the harmonic
                 mean normalized instructions-per-cycle rate by 8\% over
                 both the original path-based neural predictor and
                 2Bc-{\em gskew}. The average misprediction rate is
                 decreased by 16\% over the path-based neural predictor
                 and by 22\% over 2Bc-{\em gskew}.",
  acknowledgement = ack-nhfb,
  articleno =    "17",
  fjournal =     "ACM Transactions on Architecture and Code Optimization
                 (TACO)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J924",
  keywords =     "branch prediction; machine learning",
}

@Article{Jeon:2009:AAP,
  author =       "Jinseong Jeon and Keoncheol Shin and Hwansoo Han",
  title =        "Abstracting access patterns of dynamic memory using
                 regular expressions",
  journal =      j-TACO,
  volume =       "5",
  number =       "4",
  pages =        "18:1--18:??",
  month =        mar,
  year =         "2009",
  CODEN =        "????",
  DOI =          "http://doi.acm.org/10.1145/1498690.1498693",
  ISSN =         "1544-3566 (print), 1544-3973 (electronic)",
  bibdate =      "Wed Mar 18 21:35:33 MDT 2009",
  bibsource =    "http://portal.acm.org/;
                 http://www.math.utah.edu/pub/tex/bib/taco.bib",
  abstract =     "Unless the speed gap between CPU and memory
                 disappears, efficient memory usage remains a decisive
                 factor for performance. To optimize data usage of
                 programs in the presence of the memory hierarchy, we
                 are particularly interested in two compiler techniques:
                 {\em pool allocation\/} and {\em field layout
                 restructuring}. Since foreseeing runtime behaviors of
                 programs at compile time is difficult, most of the
                 previous work relied on profiling. On the contrary, our
                 goal is to develop a fully automatic compiler that
                 statically transforms input codes to use memory
                 efficiently. Noticing that {\em regular expressions},
                 which denote repetition explicitly, are sufficient for
                 memory access patterns, we describe how to extract
                 memory access patterns as regular expressions in
                 detail. Based on static patterns presented in regular
                 expressions, we apply pool allocation to repeatedly
                 accessed structures and exploit field layout
                 restructuring according to field affinity relations of
                 chosen structures. To make a scalable framework, we
                 devise and apply new abstraction techniques, which
                 build and interpret access patterns for the whole
                 programs in a bottom-up fashion. We implement our
                 analyses and transformations with the CIL compiler. To
                 verify the effect and scalability of our scheme, we
                 examine 17 benchmarks including 2 SPECINT 2000
                 benchmarks whose source lines of code are larger than
                 10,000. Our experiments demonstrate that the static
                 layout transformations for dynamic memory can reduce
                 L1D cache misses by 16\% and execution times by 14\% on
                 average.",
  acknowledgement = ack-nhfb,
  articleno =    "18",
  fjournal =     "ACM Transactions on Architecture and Code Optimization
                 (TACO)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J924",
  keywords =     "Access patterns; field affinity; layout
                 transformation; pool allocation; regular expressions",
}

@Article{Shobaki:2009:OTS,
  author =       "Ghassan Shobaki and Kent Wilken and Mark Heffernan",
  title =        "Optimal trace scheduling using enumeration",
  journal =      j-TACO,
  volume =       "5",
  number =       "4",
  pages =        "19:1--19:??",
  month =        mar,
  year =         "2009",
  CODEN =        "????",
  DOI =          "http://doi.acm.org/10.1145/1498690.1498694",
  ISSN =         "1544-3566 (print), 1544-3973 (electronic)",
  bibdate =      "Wed Mar 18 21:35:33 MDT 2009",
  bibsource =    "http://portal.acm.org/;
                 http://www.math.utah.edu/pub/tex/bib/taco.bib",
  abstract =     "This article presents the first optimal algorithm for
                 trace scheduling. The trace is a global scheduling
                 region used by compilers to exploit instruction-level
                 parallelism across basic block boundaries. Several
                 heuristic techniques have been proposed for trace
                 scheduling, but the precision of these techniques has
                 not been studied relative to optimality. This article
                 describes a technique for finding provably optimal
                 trace schedules, where optimality is defined in terms
                 of a weighted sum of schedule lengths across all code
                 paths in a trace. The optimal algorithm uses
                 branch-and-bound enumeration to efficiently explore the
                 entire solution space. Experimental evaluation of the
                 algorithm shows that, with a time limit of 1 second per
                 problem, 91\% of the hard trace scheduling problems in
                 the SPEC CPU 2006 Integer Benchmarks are solved
                 optimally. For 58\% of these hard problems, the optimal
                 schedule is improved compared to that produced by a
                 heuristic scheduler with a geometric mean improvement
                 of 3.2\% in weighted schedule length and 18\% in
                 compensation code size.",
  acknowledgement = ack-nhfb,
  articleno =    "19",
  fjournal =     "ACM Transactions on Architecture and Code Optimization
                 (TACO)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J924",
  keywords =     "branch-and-bound enumeration; compiler optimizations;
                 global instruction scheduling; instruction-level
                 parallelism; Instruction scheduling; optimal
                 instruction scheduling; trace scheduling",
}

@Article{Kulkarni:2009:PEO,
  author =       "Prasad A. Kulkarni and David B. Whalley and Gary S.
                 Tyson and Jack W. Davidson",
  title =        "Practical exhaustive optimization phase order
                 exploration and evaluation",
  journal =      j-TACO,
  volume =       "6",
  number =       "1",
  pages =        "1:1--1:??",
  month =        mar,
  year =         "2009",
  CODEN =        "????",
  DOI =          "http://doi.acm.org/10.1145/1509864.1509865",
  ISSN =         "1544-3566 (print), 1544-3973 (electronic)",
  bibdate =      "Thu May 7 14:55:25 MDT 2009",
  bibsource =    "http://portal.acm.org/;
                 http://www.math.utah.edu/pub/tex/bib/taco.bib",
  abstract =     "Choosing the most appropriate optimization phase
                 ordering has been a long-standing problem in compiler
                 optimizations. Exhaustive evaluation of all possible
                 orderings of optimization phases for each function is
                 generally dismissed as infeasible for
                 production-quality compilers targeting accepted
                 benchmarks. In this article, we show that it is
                 possible to exhaustively evaluate the optimization
                 phase order space for each function in a reasonable
                 amount of time for most of the functions in our
                 benchmark suite. To achieve this goal, we used various
                 techniques to significantly prune the optimization
                 phase order search space so that it can be
                 inexpensively enumerated in most cases and reduce the
                 number of program simulations required to evaluate
                 program performance for each distinct phase ordering.
                 The techniques described are applicable to other
                 compilers in which it is desirable to find the best
                 phase ordering for most functions in a reasonable
                 amount of time. We also describe some interesting
                 properties of the optimization phase order space, which
                 will prove useful for further studies of related
                 problems in compilers.",
  acknowledgement = ack-nhfb,
  articleno =    "1",
  fjournal =     "ACM Transactions on Architecture and Code Optimization
                 (TACO)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J924",
  keywords =     "exhaustive search; iterative compilation; Phase
                 ordering",
}

@Article{Hohenauer:2009:SOF,
  author =       "Manuel Hohenauer and Felix Engel and Rainer Leupers
                 and Gerd Ascheid and Heinrich Meyr",
  title =        "A {SIMD} optimization framework for retargetable
                 compilers",
  journal =      j-TACO,
  volume =       "6",
  number =       "1",
  pages =        "2:1--2:??",
  month =        mar,
  year =         "2009",
  CODEN =        "????",
  DOI =          "http://doi.acm.org/10.1145/1509864.1509866",
  ISSN =         "1544-3566 (print), 1544-3973 (electronic)",
  bibdate =      "Thu May 7 14:55:25 MDT 2009",
  bibsource =    "http://portal.acm.org/;
                 http://www.math.utah.edu/pub/tex/bib/taco.bib",
  abstract =     "Retargetable C compilers are currently widely used to
                 quickly obtain compiler support for new embedded
                 processors and to perform early processor architecture
                 exploration. A partially inherent problem of the
                 retargetable compilation approach, though, is the
                 limited code quality as compared to hand-written
                 compilers or assembly code due to the lack of dedicated
                 optimization techniques. This problem can be
                 circumvented by designing flexible, retargetable code
                 optimization techniques that apply to a certain range
                 of target architectures. This article focuses on target
                 machines with SIMD instruction support, a common
                 feature in embedded processors for multimedia
                 applications. However, SIMD optimization is known to be
                 a difficult task since SIMD architectures are largely
                 nonuniform, support only a limited set of data types
                 and impose several memory alignment constraints.
                 Additionally, such techniques require complicated loop
                 transformations, which are tailored to the SIMD
                 architecture in order to exhibit the necessary amount
                 of parallelism in the code. Thus, integrating the SIMD
                 optimization {\em and\/} the required loop
                 transformations together in a single retargeting
                 formalism is an ambitious challenge. In this article,
                 we present an efficient and quickly retargetable SIMD
                 code optimization framework that is integrated into an
                 industrial retargetable C compiler. Experimental
                 results for different processors demonstrate that the
                 proposed technique applies to real-life target machines
                 and that it produces code quality improvements close to
                 the theoretical limit.",
  acknowledgement = ack-nhfb,
  articleno =    "2",
  fjournal =     "ACM Transactions on Architecture and Code Optimization
                 (TACO)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J924",
  keywords =     "ASIP; retargetable compilers; SIMD; subword
                 parallelism; vectorization",
}

@Article{Eyerman:2009:MLP,
  author =       "Stijn Eyerman and Lieven Eeckhout",
  title =        "Memory-level parallelism aware fetch policies for
                 simultaneous multithreading processors",
  journal =      j-TACO,
  volume =       "6",
  number =       "1",
  pages =        "3:1--3:??",
  month =        mar,
  year =         "2009",
  CODEN =        "????",
  DOI =          "http://doi.acm.org/10.1145/1509864.1509867",
  ISSN =         "1544-3566 (print), 1544-3973 (electronic)",
  bibdate =      "Thu May 7 14:55:25 MDT 2009",
  bibsource =    "http://portal.acm.org/;
                 http://www.math.utah.edu/pub/tex/bib/taco.bib",
  abstract =     "A thread executing on a simultaneous multithreading
                 (SMT) processor that experiences a long-latency load
                 will eventually stall while holding execution
                 resources. Existing long-latency load aware SMT fetch
                 policies limit the amount of resources allocated by a
                 stalled thread by identifying long-latency loads and
                 preventing the thread from fetching more instructions
                 --- and in some implementations, instructions beyond
                 the long-latency load are flushed to release allocated
                 resources.\par

                 This article proposes an SMT fetch policy that takes
                 into account the available memory-level parallelism
                 (MLP) in a thread. The key idea proposed in this
                 article is that in case of an isolated long-latency
                 load (i.e., there is no MLP), the thread should be
                 prevented from allocating additional resources.
                 However, in case multiple independent long-latency
                 loads overlap (i.e., there is MLP), the thread should
                 allocate as many resources as needed in order to fully
                 expose the available MLP. MLP-aware fetch policies
                 achieve better performance for MLP-intensive threads on
                 SMT processors, leading to higher overall system
                 throughput and shorter average turnaround time than
                 previously proposed fetch policies.",
  acknowledgement = ack-nhfb,
  articleno =    "3",
  fjournal =     "ACM Transactions on Architecture and Code Optimization
                 (TACO)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J924",
  keywords =     "Fetch Policy; Memory-Level Parallelism (MLP);
                 Simultaneous Multithreading (SMT)",
}

@Article{Strozek:2009:EAE,
  author =       "Lukasz Strozek and David Brooks",
  title =        "Energy- and area-efficient architectures through
                 application clustering and architectural
                 heterogeneity",
  journal =      j-TACO,
  volume =       "6",
  number =       "1",
  pages =        "4:1--4:??",
  month =        mar,
  year =         "2009",
  CODEN =        "????",
  DOI =          "http://doi.acm.org/10.1145/1509864.1509868",
  ISSN =         "1544-3566 (print), 1544-3973 (electronic)",
  bibdate =      "Thu May 7 14:55:25 MDT 2009",
  bibsource =    "http://portal.acm.org/;
                 http://www.math.utah.edu/pub/tex/bib/taco.bib",
  abstract =     "Customizing architectures for particular applications
                 is a promising approach to yield highly
                 energy-efficient designs for embedded systems. This
                 work explores the benefits of architectural
                 customization for a class of embedded architectures
                 typically used in energy- and area-constrained
                 application domains, such as sensor nodes and
                 multimedia processing. We implement a process flow that
                 performs an automatic synthesis and evaluation of the
                 different architectures based on runtime profiles of
                 applications and determines an efficient architecture,
                 with consideration for both energy and area
                 constraints. An expressive architectural model, used by
                 our engine, is introduced that takes advantage of
                 efficient opcode allocation, several memory addressing
                 modes, and operand types. By profiling embedded
                 benchmarks from a variety of sensor and multimedia
                 applications, we show that the energy savings resulting
                 from various architectural optimizations relative to
                 the base architectures (e.g., MIPS and MSP430) are
                 significant and can reach 50\%, depending on the
                 application. We then identify the set of architectures
                 that achieves near-optimal savings for a group of
                 applications. Finally, we propose the use of
                 heterogeneous ISA processors implementing those
                 architectures as a solution to capitalize on energy
                 savings provided by application customization while
                 executing a range of applications efficiently.",
  acknowledgement = ack-nhfb,
  articleno =    "4",
  fjournal =     "ACM Transactions on Architecture and Code Optimization
                 (TACO)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J924",
  keywords =     "Efficient custom architectures; heterogeneous ISA
                 processors",
}

@Article{Venkataramani:2009:MAM,
  author =       "Guru Venkataramani and Ioannis Doudalis and Yan
                 Solihin and Milos Prvulovic",
  title =        "{MemTracker}: {An} accelerator for memory debugging
                 and monitoring",
  journal =      j-TACO,
  volume =       "6",
  number =       "2",
  pages =        "5:1--5:??",
  month =        jun,
  year =         "2009",
  CODEN =        "????",
  DOI =          "http://doi.acm.org/10.1145/1543753.1543754",
  ISSN =         "1544-3566 (print), 1544-3973 (electronic)",
  bibdate =      "Thu Jul 2 12:32:04 MDT 2009",
  bibsource =    "http://portal.acm.org/;
                 http://www.math.utah.edu/pub/tex/bib/taco.bib",
  abstract =     "Memory bugs are a broad class of bugs that is becoming
                 increasingly common with increasing software
                 complexity, and many of these bugs are also security
                 vulnerabilities. Existing software and hardware
                 approaches for finding and identifying memory bugs have
                 a number of drawbacks including considerable
                 performance overheads, target only a specific type of
                 bug, implementation cost, and inefficient use of
                 computational resources.\par

                 This article describes MemTracker, a new hardware
                 support mechanism that can be configured to perform
                 different kinds of memory access monitoring tasks.
                 MemTracker associates each word of data in memory with
                 a few bits of state, and uses a programmable state
                 transition table to react to different events that can
                 affect this state. The number of state bits per word,
                 the events to which MemTracker reacts, and the
                 transition table are all fully programmable.
                 MemTracker's rich set of states, events, and
                 transitions can be used to implement different
                 monitoring and debugging checkers with minimal
                 performance overheads, even when frequent state updates
                 are needed. To evaluate MemTracker, we map three
                 different checkers onto it, as well as a checker that
                 combines all three. For the most demanding (combined)
                 checker with 8 bits state per memory word, we observe
                 performance overheads of only around 3\%, on average,
                 and 14.5\% worst-case across different benchmark
                 suites. Such low overheads allow continuous (always-on)
                 use of MemTracker-enabled checkers, even in production
                 runs.",
  acknowledgement = ack-nhfb,
  articleno =    "5",
  fjournal =     "ACM Transactions on Architecture and Code Optimization
                 (TACO)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J924",
  keywords =     "Accelerator; debugging; memory access monitoring",
}

@Article{Gabor:2009:SLA,
  author =       "Ron Gabor and Avi Mendelson and Shlomo Weiss",
  title =        "Service level agreement for multithreaded processors",
  journal =      j-TACO,
  volume =       "6",
  number =       "2",
  pages =        "6:1--6:??",
  month =        jun,
  year =         "2009",
  CODEN =        "????",
  DOI =          "http://doi.acm.org/10.1145/1543753.1543755",
  ISSN =         "1544-3566 (print), 1544-3973 (electronic)",
  bibdate =      "Thu Jul 2 12:32:04 MDT 2009",
  bibsource =    "http://portal.acm.org/;
                 http://www.math.utah.edu/pub/tex/bib/taco.bib",
  abstract =     "Multithreading is widely used to increase processor
                 throughput. As the number of shared resources increases,
                 managing them while guaranteeing predicted performance
                 becomes a major problem. Attempts have been made in
                 previous work to ease this via different fairness
                 mechanisms. In this article, we present a new approach
                 to control the resource allocation and sharing via a
                 service level agreement (SLA)-based mechanism; that is,
                 via an agreement in which multithreaded processors
                 guarantee a minimal level of service to the running
                 threads. We introduce a new metric, {\em C\/}$_{SLA}$,
                 for conformance to SLA in multithreaded processors and
                 show that controlling resources using SLA allows
                 for higher gains than are achievable by previously
                 suggested fairness techniques. It also permits
                 improving one metric (e.g., power) while maintaining
                 SLA in another (e.g., performance). We compare SLA
                 enforcement to schemes based on other fairness metrics,
                 which are mostly targeted at equalizing execution
                 parameters. We show that using SLA rather than fairness
                 based algorithms provides a range of acceptable
                 execution points from which we can select the point
                 that best fits our optimization target, such as
                 maximizing the weighted speedup (sum of the speedups of
                 the individual threads) or reducing power. We
                 demonstrate the effectiveness of the new SLA approach
                 using switch-on-event (coarse-grained) multithreading.
                 Our weighted speedup improvement scheme successfully
                 enforces SLA while improving the weighted speedup by an
                 average of 10\% for unbalanced threads. This result is
                 significant when compared with performance losses that
                 may be incurred by fairness enforcement methods. When
                 optimizing for power reduction in unbalanced threads
                 SLA enforcement reduces the power by an average of
                 15\%. SLA may be complemented by other power reduction
                 methods to achieve further power savings {\em and\/}
                 maintain the same service level for the threads. We
                 also demonstrate differentiated SLA, where weighted
                 speedup is maximized while each thread may have a
                 different throughput constraint.",
  acknowledgement = ack-nhfb,
  articleno =    "6",
  fjournal =     "ACM Transactions on Architecture and Code Optimization
                 (TACO)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J924",
  keywords =     "fairness; performance; power; Service level agreement;
                 throughput",
}

@Article{Fung:2009:DWF,
  author =       "Wilson W. L. Fung and Ivan Sham and George Yuan and
                 Tor M. Aamodt",
  title =        "Dynamic warp formation: {Efficient MIMD} control flow
                 on {SIMD} graphics hardware",
  journal =      j-TACO,
  volume =       "6",
  number =       "2",
  pages =        "7:1--7:??",
  month =        jun,
  year =         "2009",
  CODEN =        "????",
  DOI =          "http://doi.acm.org/10.1145/1543753.1543756",
  ISSN =         "1544-3566 (print), 1544-3973 (electronic)",
  bibdate =      "Thu Jul 2 12:32:04 MDT 2009",
  bibsource =    "http://portal.acm.org/;
                 http://www.math.utah.edu/pub/tex/bib/taco.bib",
  abstract =     "Recent advances in graphics processing units (GPUs)
                 have resulted in massively parallel hardware that is
                 easily programmable and widely available in today's
                 desktop and notebook computer systems. GPUs typically
                 use single-instruction, multiple-data (SIMD) pipelines
                 to achieve high performance with minimal overhead for
                 control hardware. Scalar threads running the same
                 computing kernel are grouped together into SIMD
                 batches, sometimes referred to as warps. While SIMD is
                 ideally suited for simple programs, recent GPUs include
                 control flow instructions in the GPU instruction set
                 architecture and programs using these instructions may
                 experience reduced performance due to the way branch
                 execution is supported in hardware. One solution is to
                 add a stack to allow different SIMD processing elements
                 to execute distinct program paths after a branch
                 instruction. The occurrence of diverging branch
                 outcomes for different processing elements
                 significantly degrades performance using this approach.
                 In this article, we propose dynamic warp formation and
                 scheduling, a mechanism for more efficient SIMD branch
                 execution on GPUs. It dynamically regroups threads into
                 new warps on the fly following the occurrence of
                 diverging branch outcomes. We show that a realistic
                 hardware implementation of this mechanism improves
                 performance by 13\%, on average, with 256 threads per
                 core, 24\% with 512 threads, and 47\% with 768 threads
                 for an estimated area increase of 8\%.",
  acknowledgement = ack-nhfb,
  articleno =    "7",
  fjournal =     "ACM Transactions on Architecture and Code Optimization
                 (TACO)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J924",
  keywords =     "control flow; fine-grained multithreading; GPU; SIMD",
}

@Article{Koh:2009:TPV,
  author =       "Cheng-Kok Koh and Weng-Fai Wong and Yiran Chen and Hai
                 Li",
  title =        "Tolerating process variations in large,
                 set-associative caches: {The} buddy cache",
  journal =      j-TACO,
  volume =       "6",
  number =       "2",
  pages =        "8:1--8:??",
  month =        jun,
  year =         "2009",
  CODEN =        "????",
  DOI =          "http://doi.acm.org/10.1145/1543753.1543757",
  ISSN =         "1544-3566 (print), 1544-3973 (electronic)",
  bibdate =      "Thu Jul 2 12:32:04 MDT 2009",
  bibsource =    "http://portal.acm.org/;
                 http://www.math.utah.edu/pub/tex/bib/taco.bib",
  abstract =     "One important trend in today's microprocessor
                 architectures is the increase in size of the processor
                 caches. These caches also tend to be set associative.
                 As technology scales, process variations are expected
                 to increase the fault rates of the SRAM cells that
                 compose such caches. As an important component of the
                 processor, the parametric yield of SRAM cells is
                 crucial to the overall performance and yield of the
                 microchip. In this article, we propose a
                 microarchitectural solution, called the buddy cache,
                 that permits large, set-associative caches to tolerate
                 faults in SRAM cells due to process variations. In
                 essence, instead of disabling a faulty cache block in a
                 set (as is the current practice), it is paired with
                 another faulty cache block in the same set --- the
                 buddy. Although both cache blocks are faulty, if the
                 faults of the two blocks do not overlap, then instead
                 of losing two blocks, buddying will yield a functional
                 block from the nonfaulty portions of the two blocks. We
                 found that with buddying, caches can better mitigate
                 the negative impacts of process variations on
                 performance and yield, gracefully downgrading
                 performance as opposed to catastrophic failure. We will
                 describe the details of the buddy cache and give
                 insights as to why it is both more performance and
                 yield resilient to faults.",
  acknowledgement = ack-nhfb,
  articleno =    "8",
  fjournal =     "ACM Transactions on Architecture and Code Optimization
                 (TACO)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J924",
  keywords =     "caches; fault recovery; memory structures; Processor
                 architectures",
}

@Article{Li:2009:CDS,
  author =       "Lian Li and Hui Feng and Jingling Xue",
  title =        "Compiler-directed scratchpad memory management via
                 graph coloring",
  journal =      j-TACO,
  volume =       "6",
  number =       "3",
  pages =        "9:1--9:??",
  month =        sep,
  year =         "2009",
  CODEN =        "????",
  DOI =          "http://doi.acm.org/10.1145/1582710.1582711",
  ISSN =         "1544-3566 (print), 1544-3973 (electronic)",
  bibdate =      "Thu Oct 1 09:20:47 MDT 2009",
  bibsource =    "http://portal.acm.org/;
                 http://www.math.utah.edu/pub/tex/bib/taco.bib",
  abstract =     "Scratchpad memory (SPM), a fast on-chip SRAM managed
                 by software, is widely used in embedded systems. This
                 article introduces a general-purpose compiler approach,
                 called memory coloring, to assign static data
                 aggregates, such as arrays and structs, in a program to
                 an SPM. The novelty of this approach lies in
                 partitioning the SPM into a pseudo--register file (with
                 interchangeable and aliased registers), splitting the
                 live ranges of data aggregates to create potential data
                 transfer statements between SPM and off-chip memory,
                 and finally, adapting an existing graph coloring
                 algorithm for register allocation to assign the data
                 aggregates to the pseudo--register file. Our
                 experimental results using a set of 10 C benchmarks
                 from MediaBench and MiBench show that our methodology
                 is capable of managing SPMs efficiently and effectively
                 for large embedded applications. In addition, our SPM
                 allocator can obtain close to optimal solutions when
                 evaluated and compared against an existing
                 heuristics-based SPM allocator and an ILP-based SPM
                 allocator.",
  acknowledgement = ack-nhfb,
  articleno =    "9",
  fjournal =     "ACM Transactions on Architecture and Code Optimization
                 (TACO)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J924",
  keywords =     "graph coloring; live range splitting; memory
                 allocation; memory coloring; register coalescing;
                 Scratchpad memory; software-managed cache",
}

@Article{Golander:2009:CAR,
  author =       "Amit Golander and Shlomo Weiss",
  title =        "Checkpoint allocation and release",
  journal =      j-TACO,
  volume =       "6",
  number =       "3",
  pages =        "10:1--10:??",
  month =        sep,
  year =         "2009",
  CODEN =        "????",
  DOI =          "http://doi.acm.org/10.1145/1582710.1582712",
  ISSN =         "1544-3566 (print), 1544-3973 (electronic)",
  bibdate =      "Thu Oct 1 09:20:47 MDT 2009",
  bibsource =    "http://portal.acm.org/;
                 http://www.math.utah.edu/pub/tex/bib/taco.bib",
  abstract =     "Out-of-order speculative processors need a bookkeeping
                 method to recover from incorrect speculation. In recent
                 years, several microarchitectures that employ
                 checkpoints have been proposed, either extending the
                 reorder buffer or entirely replacing it. This work
                 presents an in-depth study of checkpointing in
                 checkpoint-based microarchitectures, from the desired
                 content of a checkpoint, via implementation trade-offs,
                 and to checkpoint allocation and release policies. A
                 major contribution of the article is a novel adaptive
                 checkpoint allocation policy that outperforms known
                 policies. The adaptive policy controls checkpoint
                 allocation according to dynamic events, such as
                 second-level cache misses and rollback history. It
                 achieves 6.8\% and 2.2\% speedup for the integer and
                 floating point benchmarks, respectively, and does not
                 require a branch confidence estimator. The results show
                 that the proposed adaptive policy achieves most of the
                 potential of an oracle policy whose performance
                 improvement is 9.8\% and 3.9\% for the integer and
                 floating point benchmarks, respectively. We exploit
                 known techniques for saving leakage power by adapting
                 and applying them to checkpoint-based
                 microarchitectures. The proposed applications combine
                 to reduce the leakage power of the register file to
                 about one half of its original value.",
  acknowledgement = ack-nhfb,
  articleno =    "10",
  fjournal =     "ACM Transactions on Architecture and Code Optimization
                 (TACO)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J924",
  keywords =     "Checkpoint; early register release; leakage;
                 misprediction; out-of-order execution; rollback",
}

@Article{Xu:2009:TXP,
  author =       "Weifeng Xu and Russell Tessier",
  title =        "{Tetris-XL}: a performance-driven spill reduction
                 technique for embedded {VLIW} processors",
  journal =      j-TACO,
  volume =       "6",
  number =       "3",
  pages =        "11:1--11:??",
  month =        sep,
  year =         "2009",
  CODEN =        "????",
  DOI =          "http://doi.acm.org/10.1145/1582710.1582713",
  ISSN =         "1544-3566 (print), 1544-3973 (electronic)",
  bibdate =      "Thu Oct 1 09:20:47 MDT 2009",
  bibsource =    "http://portal.acm.org/;
                 http://www.math.utah.edu/pub/tex/bib/taco.bib",
  abstract =     "As technology has advanced, the application space of
                 Very Long Instruction Word (VLIW) processors has grown
                 to include a variety of embedded platforms. Due to cost
                 and power consumption constraints, many embedded VLIW
                 processors contain limited resources, including
                 registers. As a result, a VLIW compiler that maximizes
                 instruction level parallelism (ILP) without considering
                 register constraints may generate excessive register
                 spills, leading to reduced overall system performance.
                 To address this issue, this article presents a new
                 spill reduction technique that improves VLIW runtime
                 performance by reordering operations prior to register
                 allocation and instruction scheduling. Unlike earlier
                 algorithms, our approach explicitly considers both
                 register reduction and data dependency in performing
                 operation reordering. Data dependency control limits
                 unexpected schedule length increases during subsequent
                 instruction scheduling. Our technique has been
                 evaluated using Trimaran, an academic VLIW compiler,
                 and evaluated using a set of embedded systems
                 benchmarks. Experimental results show that, on average,
                 this technique improves VLIW performance by 10\% for
                 VLIW processors with 32 registers and 8 functional
                 units compared with previous spill reduction
                 techniques. Limited improvement is seen versus prior
                 approaches for VLIW processors with 64 registers and 8
                 functional units.",
  acknowledgement = ack-nhfb,
  articleno =    "11",
  fjournal =     "ACM Transactions on Architecture and Code Optimization
                 (TACO)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J924",
  keywords =     "instruction level parallelism; Register pressure; Very
                 Long Instruction Word (VLIW) processor",
}

@Article{Jones:2009:ELE,
  author =       "Timothy M. Jones and Michael F. P. O'Boyle and Jaume
                 Abella and Antonio Gonz{\'a}lez and O{\u{g}}uz Ergin",
  title =        "Exploring the limits of early register release:
                 {Exploiting} compiler analysis",
  journal =      j-TACO,
  volume =       "6",
  number =       "3",
  pages =        "12:1--12:??",
  month =        sep,
  year =         "2009",
  CODEN =        "????",
  DOI =          "http://doi.acm.org/10.1145/1582710.1582714",
  ISSN =         "1544-3566 (print), 1544-3973 (electronic)",
  bibdate =      "Thu Oct 1 09:20:47 MDT 2009",
  bibsource =    "http://portal.acm.org/;
                 http://www.math.utah.edu/pub/tex/bib/taco.bib",
  abstract =     "Register pressure in modern superscalar processors can
                 be reduced by releasing registers early and by copying
                 their contents to cheap back-up storage. This article
                 quantifies the potential benefits of register occupancy
                 reduction and shows that existing hardware-based
                 schemes typically achieve only a small fraction of this
                 potential. This is because they are unable to
                 accurately determine the last use of a register and
                 must wait until the redefining instruction enters the
                 pipeline. On the other hand, compilers have a global
                 view of the program and, using simple dataflow
                 analysis, can determine the last use. This article
                 evaluates the extent to which compiler analysis can aid
                 early releasing, explores the design space, and
                 introduces commit and issue-based early releasing
                 schemes, quantifying their benefits. Using simple
                 compiler analysis and microarchitecture changes, we
                 achieve 70\% of the potential register file occupancy
                 reduction. By adding more hardware support, we can
                 increase this to 94\%. Our schemes are compared to
                 state-of-the-art approaches for varying register file
                 sizes and are shown to outperform these existing
                 techniques.",
  acknowledgement = ack-nhfb,
  articleno =    "12",
  fjournal =     "ACM Transactions on Architecture and Code Optimization
                 (TACO)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J924",
  keywords =     "compiler; energy efficiency; Low-power design;
                 microarchitecture; register file",
}

@Article{Jones:2009:EER,
  author =       "Timothy M. Jones and Michael F. P. O'Boyle and Jaume
                 Abella and Antonio Gonz{\'a}lez and O{\u{g}}uz Ergin",
  title =        "Energy-efficient register caching with compiler
                 assistance",
  journal =      j-TACO,
  volume =       "6",
  number =       "4",
  pages =        "13:1--13:??",
  month =        oct,
  year =         "2009",
  CODEN =        "????",
  ISSN =         "1544-3566 (print), 1544-3973 (electronic)",
  bibdate =      "Mon Mar 15 18:49:43 MDT 2010",
  bibsource =    "http://portal.acm.org/;
                 http://www.math.utah.edu/pub/tex/bib/taco.bib",
  acknowledgement = ack-nhfb,
  articleno =    "13",
  fjournal =     "ACM Transactions on Architecture and Code Optimization
                 (TACO)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J924",
}

@Article{Li:2009:TUC,
  author =       "Weijia Li and Youtao Zhang and Jun Yang and Jiang
                 Zheng",
  title =        "Towards update-conscious compilation for
                 energy-efficient code dissemination in {WSNs}",
  journal =      j-TACO,
  volume =       "6",
  number =       "4",
  pages =        "14:1--14:??",
  month =        oct,
  year =         "2009",
  CODEN =        "????",
  ISSN =         "1544-3566 (print), 1544-3973 (electronic)",
  bibdate =      "Mon Mar 15 18:49:43 MDT 2010",
  bibsource =    "http://portal.acm.org/;
                 http://www.math.utah.edu/pub/tex/bib/taco.bib",
  acknowledgement = ack-nhfb,
  articleno =    "14",
  fjournal =     "ACM Transactions on Architecture and Code Optimization
                 (TACO)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J924",
}

@Article{Wegiel:2009:SRC,
  author =       "Michal Wegiel and Chandra Krintz",
  title =        "The single-referent collector: {Optimizing} compaction
                 for the common case",
  journal =      j-TACO,
  volume =       "6",
  number =       "4",
  pages =        "15:1--15:??",
  month =        oct,
  year =         "2009",
  CODEN =        "????",
  ISSN =         "1544-3566 (print), 1544-3973 (electronic)",
  bibdate =      "Mon Mar 15 18:49:43 MDT 2010",
  bibsource =    "http://portal.acm.org/;
                 http://www.math.utah.edu/pub/tex/bib/taco.bib",
  acknowledgement = ack-nhfb,
  articleno =    "15",
  fjournal =     "ACM Transactions on Architecture and Code Optimization
                 (TACO)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J924",
}

@Article{Subramaniam:2009:DOS,
  author =       "Samantika Subramaniam and Gabriel H. Loh",
  title =        "Design and optimization of the store vectors memory
                 dependence predictor",
  journal =      j-TACO,
  volume =       "6",
  number =       "4",
  pages =        "16:1--16:??",
  month =        oct,
  year =         "2009",
  CODEN =        "????",
  ISSN =         "1544-3566 (print), 1544-3973 (electronic)",
  bibdate =      "Mon Mar 15 18:49:43 MDT 2010",
  bibsource =    "http://portal.acm.org/;
                 http://www.math.utah.edu/pub/tex/bib/taco.bib",
  acknowledgement = ack-nhfb,
  articleno =    "16",
  fjournal =     "ACM Transactions on Architecture and Code Optimization
                 (TACO)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J924",
}

@Article{Wang:2010:PAM,
  author =       "Xiaohang Wang and Mei Yang and Yingtao Jiang and Peng
                 Liu",
  title =        "A power-aware mapping approach to map {IP} cores onto
                 {NoCs} under bandwidth and latency constraints",
  journal =      j-TACO,
  volume =       "7",
  number =       "1",
  pages =        "1:1--1:??",
  month =        apr,
  year =         "2010",
  CODEN =        "????",
  DOI =          "http://doi.acm.org/10.1145/1736065.1736066",
  ISSN =         "1544-3566 (print), 1544-3973 (electronic)",
  bibdate =      "Wed May 5 15:38:13 MDT 2010",
  bibsource =    "http://portal.acm.org/;
                 http://www.math.utah.edu/pub/tex/bib/taco.bib",
  abstract =     "In this article, we investigate the Intellectual
                 Property (IP) mapping problem that maps a given set of
                 IP cores onto the tiles of a mesh-based Network-on-Chip
                 (NoC) architecture such that the power consumption due
                 to intercore communications is minimized. This IP
                 mapping problem is considered under both bandwidth and
                 latency constraints as imposed by the applications and
                 the on-chip network infrastructure. By examining
                 various applications' communication characteristics
                 extracted from their respective communication trace
                 graphs, two distinguishable connectivity templates are
                 realized: the graphs with tightly coupled vertices and
                 those with distributed vertices. These two templates
                 are formally defined in this article, and different
                 mapping heuristics are subsequently developed to map
                 them. In general, tightly coupled vertices are mapped
                 onto tiles that are physically close to each other
                 while the distributed vertices are mapped following a
                 graph partition scheme. Experimental results on both
                 random and multimedia benchmarks have confirmed that
                 the proposed template-based mapping algorithm achieves
                 an average of 15\% power savings as compared with MOCA,
                 a fast greedy-based mapping algorithm. Compared with a
                 branch-and-bound--based mapping algorithm, which
                 produces near optimal results but incurs an extremely
                 high computation cost, the proposed algorithm, due to
                 its polynomial runtime complexity, can generate the
                 results of almost the same quality with much less CPU
                 time. As the on-chip network size increases, the
                 superiority of the proposed algorithm becomes more
                 evident.",
  acknowledgement = ack-nhfb,
  articleno =    "1",
  fjournal =     "ACM Transactions on Architecture and Code Optimization
                 (TACO)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J924",
  keywords =     "bandwidth and latency constraints; IP mapping; Low
                 power; network-on-chip (NoC)",
}

@Article{Chen:2010:HSF,
  author =       "Zhong-Ho Chen and Alvin W. Y. Su",
  title =        "A hardware\slash software framework for instruction
                 and data scratchpad memory allocation",
  journal =      j-TACO,
  volume =       "7",
  number =       "1",
  pages =        "2:1--2:??",
  month =        apr,
  year =         "2010",
  CODEN =        "????",
  DOI =          "http://doi.acm.org/10.1145/1736065.1736067",
  ISSN =         "1544-3566 (print), 1544-3973 (electronic)",
  bibdate =      "Wed May 5 15:38:13 MDT 2010",
  bibsource =    "http://portal.acm.org/;
                 http://www.math.utah.edu/pub/tex/bib/taco.bib",
  abstract =     "Previous researches show that a scratchpad memory
                 device consumed less energy than a cache device with
                 the same capacity. In this article, we locate the
                 scratchpad memory (SPM) in the top level of the memory
                 hierarchy to reduce the energy consumption. To take the
                 advantage of a SPM, we address two issues of utilizing
                 a SPM. First, the program's locality should be
                 improved. The second issue is SPM management. To tackle
                 these two issues, we present a hardware/software
                 framework for dynamically allocating both instructions
                 and data in SPM. The software flow could be divided
                 into three phases: locality improving, locality
                 extraction, and runtime SPM management. Without
                 modifying the original compiler and the source code, we
                 improve the locality of a program. An optimization
                 algorithm is proposed to extract the SPM allocations.
                 At runtime, an SPM management program is employed. In
                 hardware, an address translation logic (ATL) is
                 proposed to reduce the overhead of SPM
                 management.\par

                 The results show that the proposed framework can reduce
                 energy delay product (EDP) by 63\%, on average, when
                 compared with the traditional cache architecture. The
                 reduction in EDP is contributed by properly allocating
                 both instructions and data in SPM. By allocating only
                 instructions in SPM, the EDPs are reduced by 45\%, on
                 average. By allocating only data in SPM, the EDPs are
                 reduced by 14\%, on average.",
  acknowledgement = ack-nhfb,
  articleno =    "2",
  fjournal =     "ACM Transactions on Architecture and Code Optimization
                 (TACO)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J924",
  keywords =     "allocation algorithm; Memory allocation; scratchpad
                 memory",
}

@Article{Woo:2010:CVI,
  author =       "Dong Hyuk Woo and Joshua B. Fryman and Allan D. Knies
                 and Hsien-Hsin S. Lee",
  title =        "{Chameleon}: {Virtualizing} idle acceleration cores of
                 a heterogeneous multicore processor for caching and
                 prefetching",
  journal =      j-TACO,
  volume =       "7",
  number =       "1",
  pages =        "3:1--3:??",
  month =        apr,
  year =         "2010",
  CODEN =        "????",
  DOI =          "http://doi.acm.org/10.1145/1736065.1736068",
  ISSN =         "1544-3566 (print), 1544-3973 (electronic)",
  bibdate =      "Wed May 5 15:38:13 MDT 2010",
  bibsource =    "http://portal.acm.org/;
                 http://www.math.utah.edu/pub/tex/bib/taco.bib",
  abstract =     "Heterogeneous multicore processors have emerged as an
                 energy- and area-efficient architectural solution to
                 improving performance for domain-specific applications
                 such as those with a plethora of data-level
                 parallelism. These processors typically contain a large
                 number of small, compute-centric cores for acceleration
                 while keeping one or two high-performance ILP cores on
                 the die to guarantee single-thread performance.
                 Although a major portion of the transistors are
                 occupied by the acceleration cores, these resources
                 will sit idle when running unparallelized legacy codes
                 or the sequential part of an application. To address
                 this underutilization issue, in this article, we
                 introduce Chameleon, a flexible heterogeneous multicore
                 architecture to virtualize these resources for
                 enhancing memory performance when running sequential
                 programs. The Chameleon architecture can dynamically
                 virtualize the idle acceleration cores into a
                 last-level cache, a data prefetcher, or a hybrid
                 between these two techniques. In addition, Chameleon
                 can operate in an adaptive mode that dynamically
                 configures the acceleration cores between the hybrid
                 mode and the prefetch-only mode by monitoring the
                 effectiveness of the Chameleon cache mode. In our
                 evaluation with SPEC2006 benchmark suite, different
                 levels of performance improvements were achieved in
                 different modes for different applications. In the case
                 of the adaptive mode, Chameleon improves the
                 performance of SPECint06 and SPECfp06 by 31\% and 15\%,
                 on average. When considering only memory-intensive
                 applications, Chameleon improves the system performance
                 by 50\% and 26\% for SPECint06 and SPECfp06,
                 respectively.",
  acknowledgement = ack-nhfb,
  articleno =    "3",
  fjournal =     "ACM Transactions on Architecture and Code Optimization
                 (TACO)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J924",
  keywords =     "cache; Heterogeneous multicore; idle core;
                 prefetching",
}

@Article{Sanchez:2010:ACI,
  author =       "Daniel Sanchez and George Michelogiannakis and
                 Christos Kozyrakis",
  title =        "An analysis of on-chip interconnection networks for
                 large-scale chip multiprocessors",
  journal =      j-TACO,
  volume =       "7",
  number =       "1",
  pages =        "4:1--4:??",
  month =        apr,
  year =         "2010",
  CODEN =        "????",
  DOI =          "http://dx.doi.org/10.1145/1736065.1736069",
  ISSN =         "1544-3566 (print), 1544-3973 (electronic)",
  bibdate =      "Wed May 5 15:38:13 MDT 2010",
  bibsource =    "http://portal.acm.org/;
                 http://www.math.utah.edu/pub/tex/bib/taco.bib",
  abstract =     "With the number of cores of chip multiprocessors
                 (CMPs) rapidly growing as technology scales down,
                 connecting the different components of a CMP in a
                 scalable and efficient way becomes increasingly
                 challenging. In this article, we explore the
                 architectural-level implications of interconnection
                 network design for CMPs with up to 128 fine-grain
                 multithreaded cores. We evaluate and compare different
                 network topologies using accurate simulation of the
                 full chip, including the memory hierarchy and
                 interconnect, and using a diverse set of scientific and
                 engineering workloads.\par

                 We find that the interconnect has a large impact on
                 performance, as it is responsible for 60\% to 75\% of
                 the miss latency. Latency, and not bandwidth, is the
                 primary performance constraint, since, even with many
                 threads per core and workloads with high miss rates,
                 networks with enough bandwidth can be efficiently
                 implemented for the system scales we consider. From the
                 topologies we study, the flattened butterfly
                 consistently outperforms the mesh and fat tree on all
                 workloads, leading to performance advantages of up to
                 22\%. We also show that considering interconnect and
                 memory hierarchy together when designing large-scale
                 CMPs is crucial, and neglecting either of the two can
                 lead to incorrect conclusions. Finally, the effect of
                 the interconnect on overall performance becomes more
                 important as the number of cores increases, making
                 interconnection choices especially critical when
                 scaling up.",
  acknowledgement = ack-nhfb,
  articleno =    "4",
  fjournal =     "ACM Transactions on Architecture and Code Optimization
                 (TACO)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J924",
  keywords =     "chip multiprocessors; hierarchical networks;
                 Networks-on-chip",
}

@Article{Zhou:2010:PAT,
  author =       "Xiuyi Zhou and Jun Yang and Marek Chrobak and Youtao
                 Zhang",
  title =        "Performance-aware thermal management via task
                 scheduling",
  journal =      j-TACO,
  volume =       "7",
  number =       "1",
  pages =        "5:1--5:??",
  month =        apr,
  year =         "2010",
  CODEN =        "????",
  DOI =          "http://dx.doi.org/10.1145/1736065.1736070",
  ISSN =         "1544-3566 (print), 1544-3973 (electronic)",
  bibdate =      "Wed May 5 15:38:13 MDT 2010",
  bibsource =    "http://portal.acm.org/;
                 http://www.math.utah.edu/pub/tex/bib/taco.bib",
  abstract =     "High on-chip temperature impairs the processor's
                 reliability and reduces its lifetime. Hardware-level
                 dynamic thermal management (DTM) techniques can
                 effectively constrain the chip temperature, but
                 degrades the performance. We propose an OS-level
                 technique that performs thermal-aware job scheduling to
                 reduce DTMs. The algorithm is based on the observation
                 that hot and cool jobs executed in a different order
                 can make a difference in resulting temperature.
                 Real-system implementation in Linux shows that our
                 scheduler can remove 10.5\% to 73.6\% of the hardware
                 DTMs in a medium thermal environment. The CPU
                 throughput is improved by up to 7.6\% (4.1\%, on
                 average) in a severe thermal environment.",
  acknowledgement = ack-nhfb,
  articleno =    "5",
  fjournal =     "ACM Transactions on Architecture and Code Optimization
                 (TACO)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J924",
  keywords =     "task scheduling; Thermal management",
}

@Article{Raghavan:2010:TTP,
  author =       "Arun Raghavan and Colin Blundell and Milo M. K.
                 Martin",
  title =        "Token tenure and {PATCH}: a predictive\slash
                 adaptive token-counting hybrid",
  journal =      j-TACO,
  volume =       "7",
  number =       "2",
  pages =        "6:1--6:??",
  month =        sep,
  year =         "2010",
  CODEN =        "????",
  DOI =          "http://doi.acm.org/10.1145/1839667.1839668",
  ISSN =         "1544-3566 (print), 1544-3973 (electronic)",
  bibdate =      "Sat Oct 2 18:05:46 MDT 2010",
  bibsource =    "http://portal.acm.org/;
                 http://www.math.utah.edu/pub/tex/bib/taco.bib",
  abstract =     "Traditional coherence protocols present a set of
                 difficult trade-offs: the reliance of snoopy protocols
                 on broadcast and ordered interconnects limits their
                 scalability, while directory protocols incur a
                 performance penalty on sharing misses due to
                 indirection. This work introduces Patch
                 (Predictive/Adaptive Token-Counting Hybrid), a
                 coherence protocol that provides the scalability of
                 directory protocols while opportunistically sending
                 direct requests to reduce sharing latency. Patch
                 extends a standard directory protocol to track tokens
                 and use token-counting rules for enforcing coherence
                 permissions. Token counting allows Patch to support
                 direct requests on an unordered interconnect, while a
                 mechanism called {\em token tenure\/} provides
                 broadcast-free forward progress using the directory
                 protocol's per-block point of ordering at the home
                 along with either timeouts at requesters or explicit
                 race notification messages.\par

                 Patch makes three main contributions. First, Patch
                 introduces token tenure, which provides broadcast-free
                 forward progress for token-counting protocols. Second,
                 Patch deprioritizes best-effort direct requests to
                 match or exceed the performance of directory protocols
                 without restricting scalability. Finally, Patch
                 provides greater scalability than directory protocols
                 when using inexact encodings of sharers because only
                 processors holding tokens need to acknowledge requests.
                 Overall, Patch is a ``one-size-fits-all'' coherence
                 protocol that dynamically adapts to work well for small
                 systems, large systems, and anywhere in between.",
  acknowledgement = ack-nhfb,
  articleno =    "6",
  fjournal =     "ACM Transactions on Architecture and Code Optimization
                 (TACO)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J924",
  keywords =     "adaptive; bandwidth-efficiency; Cache coherence
                 protocol; predictive; token coherence",
}

@Article{Wimmer:2010:AFD,
  author =       "Christian Wimmer and Hanspeter M{\"o}ssenb{\"o}ck",
  title =        "Automatic feedback-directed object fusing",
  journal =      j-TACO,
  volume =       "7",
  number =       "2",
  pages =        "7:1--7:??",
  month =        sep,
  year =         "2010",
  CODEN =        "????",
  DOI =          "http://doi.acm.org/10.1145/1839667.1839669",
  ISSN =         "1544-3566 (print), 1544-3973 (electronic)",
  bibdate =      "Sat Oct 2 18:05:46 MDT 2010",
  bibsource =    "http://portal.acm.org/;
                 http://www.math.utah.edu/pub/tex/bib/taco.bib",
  abstract =     "Object fusing is an optimization that embeds certain
                 referenced objects into their referencing object. The
                 order of objects on the heap is changed in such a way
                 that objects that are accessed together are placed next
                 to each other in memory. Their offset is then fixed,
                 that is, the objects are colocated, allowing field
                 loads to be replaced by address arithmetic. Array
                 fusing specifically optimizes arrays, which are
                 frequently used for the implementation of dynamic data
                 structures. Therefore, the length of arrays often
                 varies, and fields referencing such arrays have to be
                 changed. An efficient code pattern detects these
                 changes and allows the optimized access of such
                 fields.\par

                 We integrated these optimizations into Sun
                 Microsystems' Java HotSpot\TM{} VM. The analysis is
                 performed automatically at runtime, requires no actions
                 on the part of the programmer, and supports dynamic
                 class loading. To safely eliminate a field load, the
                 colocation of the object that holds the field and the
                 object that is referenced by the field must be
                 guaranteed. Two preconditions must be satisfied: The
                 objects must be allocated at the same time, and the
                 field must not be overwritten later. These
                 preconditions are checked by the just-in-time compiler
                 to avoid an interprocedural data flow analysis. The
                 garbage collector ensures that groups of colocated
                 objects are not split by copying groups as a whole. The
                 evaluation shows that the dynamic approach successfully
                 identifies and optimizes frequently accessed fields for
                 several benchmarks with a low compilation and analysis
                 overhead. It leads to a speedup of up to 76\% for
                 simple benchmarks and up to 6\% for complex
                 workloads.",
  acknowledgement = ack-nhfb,
  articleno =    "7",
  fjournal =     "ACM Transactions on Architecture and Code Optimization
                 (TACO)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J924",
  keywords =     "cache performance; garbage collection; Java;
                 just-in-time compilation; object colocation; object
                 fusing; object inlining; optimization",
}

@Article{Lee:2010:AIC,
  author =       "Benjamin C. Lee and David Brooks",
  title =        "Applied inference: {Case} studies in
                 microarchitectural design",
  journal =      j-TACO,
  volume =       "7",
  number =       "2",
  pages =        "8:1--8:??",
  month =        sep,
  year =         "2010",
  CODEN =        "????",
  DOI =          "http://doi.acm.org/10.1145/1839667.1839670",
  ISSN =         "1544-3566 (print), 1544-3973 (electronic)",
  bibdate =      "Sat Oct 2 18:05:46 MDT 2010",
  bibsource =    "http://portal.acm.org/;
                 http://www.math.utah.edu/pub/tex/bib/taco.bib",
  abstract =     "We propose and apply a new simulation paradigm for
                 microarchitectural design evaluation and optimization.
                 This paradigm enables more comprehensive design studies
                 by combining spatial sampling and statistical
                 inference. Specifically, this paradigm (i) defines a
                 large, comprehensive design space, (ii) samples points
                 from the space for simulation, and (iii) constructs
                 regression models based on sparse simulations. This
                 approach greatly improves the computational efficiency
                 of microarchitectural simulation and enables new
                 capabilities in design space exploration.\par

                 We illustrate new capabilities in three case studies
                 for a large design space of approximately 260,000
                 points: (i) Pareto frontier, (ii) pipeline depth, and
                 (iii) multiprocessor heterogeneity analyses. In
                 particular, regression models are exhaustively
                 evaluated to identify Pareto optimal designs that
                 maximize performance for given power budgets. These
                 models enable pipeline depth studies in which all
                 parameters vary simultaneously with depth, thereby more
                 effectively revealing interactions with nondepth
                 parameters. Heterogeneity analysis combines
                 regression-based optimization with clustering
                 heuristics to identify efficient design compromises
                 between similar optimal architectures. These
                 compromises are potential core designs in a
                 heterogeneous multicore architecture. Increasing
                 heterogeneity can improve {\em bips\/}$^3$ / {\em w\/}
                 efficiency by as much as 2.4$\times$, a theoretical
                 upper bound on heterogeneity benefits that neglects
                 contention between shared resources as well as design
                 complexity. Collectively these studies demonstrate
                 regression models' ability to expose trends and
                 identify optima in diverse design regions, motivating
                 the application of such models in statistical inference
                 for more effective use of modern simulator
                 infrastructure.",
  acknowledgement = ack-nhfb,
  articleno =    "8",
  fjournal =     "ACM Transactions on Architecture and Code Optimization
                 (TACO)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J924",
  keywords =     "microarchitecture; regression; simulation;
                 statistics",
}

@Article{Rakvic:2010:TMT,
  author =       "R. Rakvic and Q. Cai and J. Gonz{\'a}lez and G.
                 Magklis and P. Chaparro and A. Gonz{\'a}lez",
  title =        "Thread-management techniques to maximize efficiency in
                 multicore and simultaneous multithreaded
                 microprocessors",
  journal =      j-TACO,
  volume =       "7",
  number =       "2",
  pages =        "9:1--9:??",
  month =        sep,
  year =         "2010",
  CODEN =        "????",
  DOI =          "http://doi.acm.org/10.1145/1839667.1839671",
  ISSN =         "1544-3566 (print), 1544-3973 (electronic)",
  bibdate =      "Sat Oct 2 18:05:46 MDT 2010",
  bibsource =    "http://portal.acm.org/;
                 http://www.math.utah.edu/pub/tex/bib/taco.bib",
  abstract =     "We provide an analysis of thread-management techniques
                 that increase performance or reduce energy in multicore
                 and Simultaneous Multithreaded (SMT) cores. Thread
                 delaying reduces energy consumption by running the core
                 containing the critical thread at maximum frequency
                 while scaling down the frequency and voltage of the
                 cores containing noncritical threads. In this article,
                 we provide an insightful breakdown of thread delaying
                 on a simulated multi-core microprocessor. Thread
                 balancing improves overall performance by giving higher
                 priority to the critical thread in the issue queue of
                 an SMT core. We provide a detailed breakdown of
                 performance results for thread-balancing, identifying
                 performance benefits and limitations. For those
                 benchmarks where a performance benefit is not possible,
                 we introduce a novel thread-balancing mechanism on an
                 SMT core that can reduce energy consumption. We have
                 performed a detailed study on an Intel microprocessor
                 simulator running parallel applications. Thread
                 delaying can reduce energy consumption by 4\% to 44\%
                 with negligible performance loss. Thread balancing can
                 increase performance by 20\% or can reduce energy
                 consumption by 23\%.",
  acknowledgement = ack-nhfb,
  articleno =    "9",
  fjournal =     "ACM Transactions on Architecture and Code Optimization
                 (TACO)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J924",
  keywords =     "critical threads; energy-aware; low-power; meeting
                 point thread characterization; microarchitecture;
                 multi-threaded application; thread balancing; thread
                 delaying",
}

@Article{Pao:2010:MEP,
  author =       "Derek Pao and Wei Lin and Bin Liu",
  title =        "A memory-efficient pipelined implementation of the
                 {Aho--Corasick} string-matching algorithm",
  journal =      j-TACO,
  volume =       "7",
  number =       "2",
  pages =        "10:1--10:??",
  month =        sep,
  year =         "2010",
  CODEN =        "????",
  DOI =          "http://doi.acm.org/10.1145/1839667.1839672",
  ISSN =         "1544-3566 (print), 1544-3973 (electronic)",
  bibdate =      "Sat Oct 2 18:05:46 MDT 2010",
  bibsource =    "http://portal.acm.org/;
                 http://www.math.utah.edu/pub/tex/bib/taco.bib",
  abstract =     "With rapid advancement in Internet technology and
                 usages, some emerging applications in data
                 communications and network security require matching of
                 huge volume of data against large signature sets with
                 thousands of strings in real time. In this article, we
                 present a memory-efficient hardware implementation of
                 the well-known Aho--Corasick (AC) string-matching
                 algorithm using a pipelining approach called P-AC. An
                 attractive feature of the AC algorithm is that it can
                 solve the string-matching problem in time linearly
                 proportional to the length of the input stream, and the
                 computation time is independent of the number of
                 strings in the signature set. A major disadvantage of
                 the AC algorithm is the high memory cost required to
                 store the transition rules of the underlying
                 deterministic finite automaton. By incorporating
                 pipelined processing, the state graph is reduced to a
                 character trie that only contains forward edges.
                 Together with an intelligent implementation of look-up
                 tables, the memory cost of P-AC is only about 18 bits
                 per character for a signature set containing 6,166
                 strings extracted from Snort. The control structure of
                 P-AC is simple and elegant. The cost of the control
                 logic is very low. With the availability of dual-port
                 memories in FPGA devices, we can double the system
                 throughput by duplicating the control logic such that
                 the system can process two data streams concurrently.
                 Since our method is memory-based, incremental changes
                 to the signature set can be accommodated by updating
                 the look-up tables without reconfiguring the FPGA
                 circuitry.",
  acknowledgement = ack-nhfb,
  articleno =    "10",
  fjournal =     "ACM Transactions on Architecture and Code Optimization
                 (TACO)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J924",
  keywords =     "deterministic and nondeterministic finite automaton;
                 intrusion detection system; pipelined processing;
                 string-matching",
}

@Article{Yang:2010:ERS,
  author =       "Xuejun Yang and Ying Zhang and Xicheng Lu and Jingling
                 Xue and Ian Rogers and Gen Li and Guibin Wang and
                 Xudong Fang",
  title =        "Exploiting the reuse supplied by loop-dependent stream
                 references for stream processors",
  journal =      j-TACO,
  volume =       "7",
  number =       "2",
  pages =        "11:1--11:??",
  month =        sep,
  year =         "2010",
  CODEN =        "????",
  DOI =          "http://doi.acm.org/10.1145/1839667.1839673",
  ISSN =         "1544-3566 (print), 1544-3973 (electronic)",
  bibdate =      "Sat Oct 2 18:05:46 MDT 2010",
  bibsource =    "http://portal.acm.org/;
                 http://www.math.utah.edu/pub/tex/bib/taco.bib",
  abstract =     "Memory accesses limit the performance of stream
                 processors. By exploiting the reuse of data held in the
                 Stream Register File (SRF), an on-chip, software
                 controlled storage, the number of memory accesses can
                 be reduced. In current stream compilers, reuse
                 exploitation is only attempted for simple stream
                 references, those whose start and end are known.
                 Compiler analysis, from outside of stream processors,
                 does not directly enable the consideration of other
                 more complex stream references. In this article, we
                 propose a transformation to automatically optimize
                 stream programs to exploit the reuse supplied by
                 loop-dependent stream references. The transformation is
                 based on three results: lemmas identifying the reuse
                 supplied by stream references, a new abstract
                 representation called the Stream Reuse Graph (SRG)
                 depicting the identified reuse, and the optimization of
                 the SRG for our transformation. Both the reuse between
                 the whole sequences accessed by stream references and
                 between partial sequences is exploited in the article.
                 In particular, partial reuse and its treatment are
                 quite new and have never, to the best of our knowledge,
                 appeared in scalar and vector processing. At the same
                 time, reusing streams increases the pressure on the
                 SRF, and this presents a problem of which reuse should
                 be exploited within limited SRF capacity. We extend our
                 analysis to achieve this objective. Finally, we
                 implement our techniques based on the StreamC/KernelC
                 compiler that has been optimized with the best existing
                 compilation techniques for stream processors.
                 Experimental results show a resultant speed-up of 1.14
                 to 2.54 times using a range of benchmarks.",
  acknowledgement = ack-nhfb,
  articleno =    "11",
  fjournal =     "ACM Transactions on Architecture and Code Optimization
                 (TACO)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J924",
  keywords =     "StreamC; stream processor; stream programming model;
                 stream register file; stream reuse",
}

@Article{Reddi:2010:EVE,
  author =       "Vijay Janapa Reddi and Simone Campanoni and Meeta S.
                 Gupta and Michael D. Smith and Gu-Yeon Wei and David
                 Brooks and Kim Hazelwood",
  title =        "Eliminating voltage emergencies via software-guided
                 code transformations",
  journal =      j-TACO,
  volume =       "7",
  number =       "2",
  pages =        "12:1--12:??",
  month =        sep,
  year =         "2010",
  CODEN =        "????",
  DOI =          "http://doi.acm.org/10.1145/1839667.1839674",
  ISSN =         "1544-3566 (print), 1544-3973 (electronic)",
  bibdate =      "Sat Oct 2 18:05:46 MDT 2010",
  bibsource =    "http://portal.acm.org/;
                 http://www.math.utah.edu/pub/tex/bib/taco.bib",
  abstract =     "In recent years, circuit reliability in modern
                 high-performance processors has become increasingly
                 important. Shrinking feature sizes and diminishing
                 supply voltages have made circuits more sensitive to
                 microprocessor supply voltage fluctuations. These
                 fluctuations result from the natural variation of
                 processor activity as workloads execute, but when left
                 unattended, these voltage fluctuations can lead to
                 timing violations or even transistor lifetime issues.
                 In this article, we present a hardware--software
                 collaborative approach to mitigate voltage
                 fluctuations. A checkpoint-recovery mechanism rectifies
                 errors when voltage violates maximum tolerance
                 settings, while a runtime software layer reschedules
                 the program's instruction stream to prevent recurring
                 violations at the same program location. The runtime
                 layer, combined with the proposed code-rescheduling
                 algorithm, removes 60\% of all violations with minimal
                 overhead, thereby significantly improving overall
                 performance. Our solution is a radical departure from
                 the ongoing industry-standard approach to circumvent
                 the issue altogether by optimizing for the worst-case
                 voltage flux, which compromises power and performance
                 efficiency severely, especially looking ahead to future
                 technology generations. Existing conservative
                 approaches will have severe implications on the ability
                 to deliver efficient microprocessors. The proposed
                 technique reassembles a traditional reliability problem
                 as a runtime performance optimization problem, thus
                 allowing us to design processors for typical case
                 operation by building intelligent algorithms that can
                 prevent recurring violations.",
  acknowledgement = ack-nhfb,
  articleno =    "12",
  fjournal =     "ACM Transactions on Architecture and Code Optimization
                 (TACO)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J924",
  keywords =     "dI/dt; inductive noise; voltage emergencies; voltage
                 noise",
}

@article{Zhao:2010:PPP,
  author          = {Qin Zhao and Ioana Cutcutache and Weng-Fai Wong},
  title           = {{PiPA}: {Pipelined} profiling and analysis on
                     multicore systems},
  journal         = j-TACO,
  volume          = {7},
  number          = {3},
  pages           = {13:1--13:??},
  month           = dec,
  year            = {2010},
  coden           = {????},
  doi             = {http://dx.doi.org/10.1145/1880037.1880038},
  issn            = {1544-3566 (print), 1544-3973 (electronic)},
  bibdate         = {Mon Jan 10 09:37:16 MST 2011},
  bibsource       = {http://portal.acm.org/;
                     http://www.math.utah.edu/pub/tex/bib/taco.bib},
  abstract        = {Profiling and online analysis are important tasks in
                     program understanding and feedback-directed
                     optimization. However, fine-grained profiling and
                     online analysis tend to seriously slow down the
                     application. To cope with the slowdown, one may have to
                     terminate the process early or resort to sampling. The
                     former tends to distort the result because of warm-up
                     effects. The latter runs the risk of missing important
                     effects because sampling was turned off during the time
                     that these effects appeared. A promising approach is to
                     make use of the parallel processing capabilities of the
                     now ubiquitous multicore processors to speed up the
                     profiling and analysis process.},
  acknowledgement = ack-nhfb,
  articleno       = {13},
  fjournal        = {ACM Transactions on Architecture and Code Optimization
                     (TACO)},
  journal-url     = {http://portal.acm.org/browse_dl.cfm?idx=J924},
}

@article{Guo:2010:QSS,
  author          = {Fei Guo and Yan Solihin and Li Zhao and Ravishankar
                     Iyer},
  title           = {Quality of service shared cache management in chip
                     multiprocessor architecture},
  journal         = j-TACO,
  volume          = {7},
  number          = {3},
  pages           = {14:1--14:??},
  month           = dec,
  year            = {2010},
  coden           = {????},
  doi             = {http://dx.doi.org/10.1145/1880037.1880039},
  issn            = {1544-3566 (print), 1544-3973 (electronic)},
  bibdate         = {Mon Jan 10 09:37:16 MST 2011},
  bibsource       = {http://portal.acm.org/;
                     http://www.math.utah.edu/pub/tex/bib/taco.bib},
  abstract        = {The trends in enterprise IT toward service-oriented
                     computing, server consolidation, and virtual computing
                     point to a future in which workloads are becoming
                     increasingly diverse in terms of performance,
                     reliability, and availability requirements. It can be
                     expected that more and more applications with diverse
                     requirements will run on a Chip Multi-Processor (CMP)
                     and share platform resources such as the lowest level
                     cache and off-chip bandwidth. In this environment, it
                     is desirable to have microarchitecture and software
                     support that can provide a guarantee of a certain level
                     of performance, which we refer to as performance
                     Quality of Service. In this article, we investigated a
                     framework would be needed to manage the shared cache
                     resource for fully providing QoS in a CMP.},
  acknowledgement = ack-nhfb,
  articleno       = {14},
  fjournal        = {ACM Transactions on Architecture and Code Optimization
                     (TACO)},
  journal-url     = {http://portal.acm.org/browse_dl.cfm?idx=J924},
}

@Article{Wu:2010:DEH,
  author =       "Xiaoxia Wu and Jian Li and Lixin Zhang and Evan
                 Speight and Ram Rajamony and Yuan Xie",
  title =        "Design exploration of hybrid caches with disparate
                 memory technologies",
  journal =      j-TACO,
  volume =       "7",
  number =       "3",
  pages =        "15:1--15:??",
  month =        dec,
  year =         "2010",
  CODEN =        "????",
  DOI =          "http://dx.doi.org/10.1145/1880037.1880040",
  ISSN =         "1544-3566 (print), 1544-3973 (electronic)",
  bibdate =      "Mon Jan 10 09:37:16 MST 2011",
  bibsource =    "http://portal.acm.org/;
                 http://www.math.utah.edu/pub/tex/bib/taco.bib",
  abstract =     "Traditional multilevel SRAM-based cache hierarchies,
                 especially in the context of chip multiprocessors
                 (CMPs), present many challenges in area requirements,
                 core-to-cache balance, power consumption, and design
                 complexity. New advancements in technology enable
                 caches to be built from other technologies, such as
                 Embedded DRAM (EDRAM), Magnetic RAM (MRAM), and
                 Phase-change RAM (PRAM), in both 2D chips or 3D stacked
                 chips. Caches fabricated in these technologies offer
                 dramatically different power-performance
                 characteristics when compared with SRAM-based caches,
                 particularly in the areas of access latency, cell
                 density, and overall power consumption. In this
                 article, we propose to take advantage of the best
                 characteristics that each technology has to offer
                 through the use of Hybrid Cache Architecture (HCA)
                 designs.",
  acknowledgement = ack-nhfb,
  articleno =    "15",
  fjournal =     "ACM Transactions on Architecture and Code Optimization
                 (TACO)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J924",
}

@article{Kourtis:2010:ECO,
  author          = {Kornilios Kourtis and Georgios Goumas and Nectarios
                     Koziris},
  title           = {Exploiting compression opportunities to improve
                     {SpMxV} performance on shared memory systems},
  journal         = j-TACO,
  volume          = {7},
  number          = {3},
  pages           = {16:1--16:??},
  month           = dec,
  year            = {2010},
  coden           = {????},
  doi             = {http://dx.doi.org/10.1145/1880037.1880041},
  issn            = {1544-3566 (print), 1544-3973 (electronic)},
  bibdate         = {Mon Jan 10 09:37:16 MST 2011},
  bibsource       = {http://portal.acm.org/;
                     http://www.math.utah.edu/pub/tex/bib/taco.bib},
  abstract        = {The Sparse Matrix-Vector Multiplication (SpMxV) kernel
                     exhibits poor scaling on shared memory systems, due to
                     the streaming nature of its data access pattern. To
                     decrease memory contention and improve kernel
                     performance we propose two compression schemes: CSR-DU,
                     that targets the reduction of the matrix structural
                     data by applying coarse-grained delta-encoding, and
                     CSR-VI, that targets the reduction of the values using
                     indirect indexing, applicable to matrices with a small
                     number of unique values.},
  acknowledgement = ack-nhfb,
  articleno       = {16},
  fjournal        = {ACM Transactions on Architecture and Code Optimization
                     (TACO)},
  journal-url     = {http://portal.acm.org/browse_dl.cfm?idx=J924},
}

@article{Buyukkurt:2010:IHL,
  author          = {Betul Buyukkurt and John Cortes and Jason Villarreal
                     and Walid A. Najjar},
  title           = {Impact of high-level transformations within the
                     {ROCCC} framework},
  journal         = j-TACO,
  volume          = {7},
  number          = {4},
  pages           = {17:1--17:??},
  month           = dec,
  year            = {2010},
  coden           = {????},
  doi             = {http://dx.doi.org/10.1145/1880043.1880044},
  issn            = {1544-3566 (print), 1544-3973 (electronic)},
  bibdate         = {Mon Jan 10 09:37:16 MST 2011},
  bibsource       = {http://portal.acm.org/;
                     http://www.math.utah.edu/pub/tex/bib/taco.bib},
  acknowledgement = ack-nhfb,
  articleno       = {17},
  fjournal        = {ACM Transactions on Architecture and Code Optimization
                     (TACO)},
  journal-url     = {http://portal.acm.org/browse_dl.cfm?idx=J924},
}

@article{Hwang:2010:DCR,
  author          = {Yuan-Shin Hwang and Tzong-Yen Lin and Rong-Guey
                     Chang},
  title           = {{DisIRer}: {Converting} a retargetable compiler into a
                     multiplatform binary translator},
  journal         = j-TACO,
  volume          = {7},
  number          = {4},
  pages           = {18:1--18:??},
  month           = dec,
  year            = {2010},
  coden           = {????},
  doi             = {http://dx.doi.org/10.1145/1880043.1880045},
  issn            = {1544-3566 (print), 1544-3973 (electronic)},
  bibdate         = {Mon Jan 10 09:37:16 MST 2011},
  bibsource       = {http://portal.acm.org/;
                     http://www.math.utah.edu/pub/tex/bib/taco.bib},
  acknowledgement = ack-nhfb,
  articleno       = {18},
  fjournal        = {ACM Transactions on Architecture and Code Optimization
                     (TACO)},
  journal-url     = {http://portal.acm.org/browse_dl.cfm?idx=J924},
}

@article{Boyer:2010:FBP,
  author          = {Michael Boyer and David Tarjan and Kevin Skadron},
  title           = {Federation: {Boosting} per-thread performance of
                     throughput-oriented manycore architectures},
  journal         = j-TACO,
  volume          = {7},
  number          = {4},
  pages           = {19:1--19:??},
  month           = dec,
  year            = {2010},
  coden           = {????},
  doi             = {http://dx.doi.org/10.1145/1880043.1880046},
  issn            = {1544-3566 (print), 1544-3973 (electronic)},
  bibdate         = {Mon Jan 10 09:37:16 MST 2011},
  bibsource       = {http://portal.acm.org/;
                     http://www.math.utah.edu/pub/tex/bib/taco.bib},
  acknowledgement = ack-nhfb,
  articleno       = {19},
  fjournal        = {ACM Transactions on Architecture and Code Optimization
                     (TACO)},
  journal-url     = {http://portal.acm.org/browse_dl.cfm?idx=J924},
}

@Article{Fursin:2010:COP,
  author =       "Grigori Fursin and Olivier Temam",
  title =        "Collective optimization: {A} practical collaborative
                 approach",
  journal =      j-TACO,
  volume =       "7",
  number =       "4",
  pages =        "20:1--20:??",
  month =        dec,
  year =         "2010",
  CODEN =        "????",
  DOI =          "http://dx.doi.org/10.1145/1880043.1880047",
  ISSN =         "1544-3566 (print), 1544-3973 (electronic)",
  bibdate =      "Mon Jan 10 09:37:16 MST 2011",
  bibsource =    "http://portal.acm.org/;
                 http://www.math.utah.edu/pub/tex/bib/taco.bib",
  acknowledgement = ack-nhfb,
  articleno =    "20",
  fjournal =     "ACM Transactions on Architecture and Code Optimization
                 (TACO)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J924",
}

@article{Liu:2010:UBI,
  author          = {Fang Liu and Yan Solihin},
  title           = {Understanding the behavior and implications of context
                     switch misses},
  journal         = j-TACO,
  volume          = {7},
  number          = {4},
  pages           = {21:1--21:??},
  month           = dec,
  year            = {2010},
  coden           = {????},
  doi             = {http://dx.doi.org/10.1145/1880043.1880048},
  issn            = {1544-3566 (print), 1544-3973 (electronic)},
  bibdate         = {Mon Jan 10 09:37:16 MST 2011},
  bibsource       = {http://portal.acm.org/;
                     http://www.math.utah.edu/pub/tex/bib/taco.bib},
  acknowledgement = ack-nhfb,
  articleno       = {21},
  fjournal        = {ACM Transactions on Architecture and Code Optimization
                     (TACO)},
  journal-url     = {http://portal.acm.org/browse_dl.cfm?idx=J924},
}

@article{Eyerman:2011:FGD,
  author          = {Stijn Eyerman and Lieven Eeckhout},
  title           = {Fine-grained {DVFS} using on-chip regulators},
  journal         = j-TACO,
  volume          = {8},
  number          = {1},
  pages           = {1:1--1:??},
  month           = apr,
  year            = {2011},
  coden           = {????},
  doi             = {http://dx.doi.org/10.1145/1952998.1952999},
  issn            = {1544-3566 (print), 1544-3973 (electronic)},
  bibdate         = {Wed Apr 27 07:54:03 MDT 2011},
  bibsource       = {http://portal.acm.org/;
                     http://www.math.utah.edu/pub/tex/bib/taco.bib},
  abstract        = {Limit studies on Dynamic Voltage and Frequency Scaling
                     (DVFS) provide apparently contradictory conclusions. On
                     the one hand early limit studies report that DVFS is
                     effective at large timescales (on the order of
                     million(s) of cycles) with large scaling overheads (on
                     the order of tens of microseconds), and they conclude
                     that there is no need for small overhead DVFS at small
                     timescales. Recent work on the other hand --- motivated
                     by the surge of on-chip voltage regulator research ---
                     explores the potential of fine-grained DVFS and reports
                     substantial energy savings at timescales of hundreds of
                     cycles (while assuming no scaling overhead). This
                     article unifies these apparently contradictory
                     conclusions through a DVFS limit study that
                     simultaneously explores timescale and scaling speed.},
  acknowledgement = ack-nhfb,
  articleno       = {1},
  fjournal        = {ACM Transactions on Architecture and Code Optimization
                     (TACO)},
  journal-url     = {http://portal.acm.org/browse_dl.cfm?idx=J924},
}

@article{Cher:2011:EEC,
  author          = {Chen-Yong Cher and Eren Kursun},
  title           = {Exploring the effects of on-chip thermal variation on
                     high-performance multicore architectures},
  journal         = j-TACO,
  volume          = {8},
  number          = {1},
  pages           = {2:1--2:??},
  month           = apr,
  year            = {2011},
  coden           = {????},
  doi             = {http://dx.doi.org/10.1145/1952998.1953000},
  issn            = {1544-3566 (print), 1544-3973 (electronic)},
  bibdate         = {Wed Apr 27 07:54:03 MDT 2011},
  bibsource       = {http://portal.acm.org/;
                     http://www.math.utah.edu/pub/tex/bib/taco.bib},
  abstract        = {Inherent temperature variation among cores in a
                     multicore architecture can be caused by a number of
                     factors including process variation, cooling and
                     packaging imperfections, and even placement of the chip
                     in the module. Current dynamic thermal management
                     techniques assume identical heating profiles for
                     homogeneous multicore architectures. Our experimental
                     results indicate that inherent thermal variation is
                     very common in existing multicores. While most
                     multicore chips accommodate multiple thermal sensors,
                     the dynamic power/thermal management schemes are
                     oblivious of the inherent heating tendencies. Hence, in
                     the case of variation, the chip faces repetitive
                     hotspots running on such cores. In this article, we
                     propose a technique that leverages the on-chip sensor
                     infrastructure as well as the capabilities of
                     power/thermal management to effectively reduce the
                     heating and minimize local hotspots.},
  acknowledgement = ack-nhfb,
  articleno       = {2},
  fjournal        = {ACM Transactions on Architecture and Code Optimization
                     (TACO)},
  journal-url     = {http://portal.acm.org/browse_dl.cfm?idx=J924},
}

@Article{Wu:2011:ATR,
  author =       "Carole-Jean Wu and Margaret Martonosi",
  title =        "Adaptive timekeeping replacement: Fine-grained
                 capacity management for shared {CMP} caches",
  journal =      j-TACO,
  volume =       "8",
  number =       "1",
  pages =        "3:1--3:??",
  month =        apr,
  year =         "2011",
  CODEN =        "????",
  DOI =          "http://dx.doi.org/10.1145/1952998.1953001",
  ISSN =         "1544-3566 (print), 1544-3973 (electronic)",
  ISSN-L =       "1544-3566",
  bibdate =      "Wed Apr 27 07:54:03 MDT 2011",
  bibsource =    "http://portal.acm.org/;
                 http://www.math.utah.edu/pub/tex/bib/taco.bib",
  abstract =     "In chip multiprocessors (CMPs), several
                 high-performance cores typically compete for capacity
                 in a shared last-level cache. This causes degraded and
                 unpredictable memory performance for multiprogrammed
                 and parallel workloads. In response, recent schemes
                 apportion cache bandwidth and capacity in ways that
                 offer better aggregate performance for the workloads.
                 These schemes, however, focus primarily on relatively
                 coarse-grained capacity management without concern for
                 operating system process priority levels. In this work,
                 we explore capacity management approaches that are both
                 temporally and spatially more fine-grained than prior
                 work. We also consider operating system priority levels
                 as part of capacity management. We propose a capacity
                 management mechanism based on timekeeping techniques
                 that track the time interval since the last access to
                 cached data.",
  acknowledgement = ack-nhfb,
  articleno =    "3",
  fjournal =     "ACM Transactions on Architecture and Code Optimization
                 (TACO)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J924",
}

@Article{Vespa:2011:DFA,
  author =       "Lucas Vespa and Ning Weng",
  title =        "Deterministic finite automata characterization and
                 optimization for scalable pattern matching",
  journal =      j-TACO,
  volume =       "8",
  number =       "1",
  pages =        "4:1--4:??",
  month =        apr,
  year =         "2011",
  CODEN =        "????",
  DOI =          "http://dx.doi.org/10.1145/1952998.1953002",
  ISSN =         "1544-3566 (print), 1544-3973 (electronic)",
  ISSN-L =       "1544-3566",
  bibdate =      "Wed Apr 27 07:54:03 MDT 2011",
  bibsource =    "http://portal.acm.org/;
                 http://www.math.utah.edu/pub/tex/bib/taco.bib",
  abstract =     "Memory-based Deterministic Finite Automata (DFA) are
                 ideal for pattern matching in network intrusion
                 detection systems due to their deterministic
                 performance and ease of update of new patterns, however
                 severe DFA memory requirements make it impractical to
                 implement thousands of patterns. This article aims to
                 understand the basic relationship between DFA
                 characteristics and memory requirements, and to design
                 a practical memory-based pattern matching engine. We
                 present a methodology that consists of theoretical DFA
                 characterization, encoding optimization, and
                 implementation architecture. Results show the validity
                 of the characterization metrics, effectiveness of the
                 encoding techniques, and efficiency of the memory-based
                 pattern engines.",
  acknowledgement = ack-nhfb,
  articleno =    "4",
  fjournal =     "ACM Transactions on Architecture and Code Optimization
                 (TACO)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J924",
}

@Article{Bhattacharjee:2011:PLC,
  author =       "Abhishek Bhattacharjee and Gilberto Contreras and
                 Margaret Martonosi",
  title =        "Parallelization libraries: Characterizing and reducing
                 overheads",
  journal =      j-TACO,
  volume =       "8",
  number =       "1",
  pages =        "5:1--5:??",
  month =        apr,
  year =         "2011",
  CODEN =        "????",
  DOI =          "http://dx.doi.org/10.1145/1952998.1953003",
  ISSN =         "1544-3566 (print), 1544-3973 (electronic)",
  ISSN-L =       "1544-3566",
  bibdate =      "Wed Apr 27 07:54:03 MDT 2011",
  bibsource =    "http://portal.acm.org/;
                 http://www.math.utah.edu/pub/tex/bib/taco.bib",
  abstract =     "Creating efficient, scalable dynamic parallel runtime
                 systems for chip multiprocessors (CMPs) requires
                 understanding the overheads that manifest at high core
                 counts and small task sizes. In this article, we assess
                 these overheads on Intel's Threading Building Blocks
                 (TBB) and OpenMP. First, we use real hardware and
                 simulations to detail various scheduler and
                 synchronization overheads. We find that these can
                 amount to 47\% of TBB benchmark runtime and 80\% of
                 OpenMP benchmark runtime. Second, we propose load
                 balancing techniques such as occupancy-based and
                 criticality-guided task stealing, to boost performance.
                 Overall, our study provides valuable insights for
                 creating robust, scalable runtime libraries.",
  acknowledgement = ack-nhfb,
  articleno =    "5",
  fjournal =     "ACM Transactions on Architecture and Code Optimization
                 (TACO)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J924",
}

@Article{Dong:2011:HCU,
  author =       "Xiangyu Dong and Yuan Xie and Naveen Muralimanohar and
                 Norman P. Jouppi",
  title =        "Hybrid checkpointing using emerging nonvolatile
                 memories for future exascale systems",
  journal =      j-TACO,
  volume =       "8",
  number =       "2",
  pages =        "6:1--6:??",
  month =        jul,
  year =         "2011",
  CODEN =        "????",
  DOI =          "http://dx.doi.org/10.1145/1970386.1970387",
  ISSN =         "1544-3566 (print), 1544-3973 (electronic)",
  ISSN-L =       "1544-3566",
  bibdate =      "Fri Jun 17 18:32:40 MDT 2011",
  bibsource =    "http://portal.acm.org/;
                 http://www.math.utah.edu/pub/tex/bib/taco.bib",
  abstract =     "The scalability of future Massively Parallel
                 Processing (MPP) systems is being severely challenged
                 by high failure rates. Current centralized Hard Disk
                 Drive (HDD) checkpointing results in overhead of 25\%
                 or more at petascale. Since systems become more
                 vulnerable as the node count keeps increasing, novel
                 techniques that enable fast and frequent checkpointing
                 are critical to the future exascale system
                 implementation. In this work, we first introduce one of
                 the emerging nonvolatile memory technologies,
                 Phase-Change Random Access Memory (PCRAM), as a proper
                 candidate of the fast checkpointing device. After a
                 thorough analysis of MPP systems, failure rates and
                 failure sources, we propose a PCRAM-based hybrid
                 local/global checkpointing mechanism which not only
                 provides a faster checkpoint storage, but also boosts
                 the effectiveness of other orthogonal techniques such
                 as incremental checkpointing and background
                 checkpointing.",
  acknowledgement = ack-nhfb,
  articleno =    "6",
  fjournal =     "ACM Transactions on Architecture and Code Optimization
                 (TACO)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J924",
}

@Article{Li:2011:EEM,
  author =       "Jianjun Li and Chenggang Wu and Wei-Chung Hsu",
  title =        "Efficient and effective misaligned data access
                 handling in a dynamic binary translation system",
  journal =      j-TACO,
  volume =       "8",
  number =       "2",
  pages =        "7:1--7:??",
  month =        jul,
  year =         "2011",
  CODEN =        "????",
  DOI =          "http://dx.doi.org/10.1145/1970386.1970388",
  ISSN =         "1544-3566 (print), 1544-3973 (electronic)",
  ISSN-L =       "1544-3566",
  bibdate =      "Fri Jun 17 18:32:40 MDT 2011",
  bibsource =    "http://portal.acm.org/;
                 http://www.math.utah.edu/pub/tex/bib/taco.bib",
  abstract =     "Binary Translation (BT) has been commonly used to
                 migrate application software across Instruction Set
                 Architectures (ISAs). Some architectures, such as X86,
                 allow Misaligned Data Accesses (MDAs), while most
                 modern architectures require natural data alignments.
                 In a binary translation system, where the source ISA
                 allows MDA and the target ISA does not, memory
                 operations must be carefully translated. Naive
                 translation may cause frequent misaligned data access
                 traps to occur at runtime on the target machine and
                 severely slow down the migrated application. This
                 article evaluates different approaches in handling MDA
                 in a binary translation system including how to
                 identify MDA candidates and how to translate such
                 memory instructions.",
  acknowledgement = ack-nhfb,
  articleno =    "7",
  fjournal =     "ACM Transactions on Architecture and Code Optimization
                 (TACO)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J924",
}

@Article{Venkataramani:2011:DDS,
  author =       "Guru Venkataramani and Christopher J. Hughes and
                 Sanjeev Kumar and Milos Prvulovic",
  title =        "{DeFT}: Design space exploration for on-the-fly
                 detection of coherence misses",
  journal =      j-TACO,
  volume =       "8",
  number =       "2",
  pages =        "8:1--8:??",
  month =        jul,
  year =         "2011",
  CODEN =        "????",
  DOI =          "http://dx.doi.org/10.1145/1970386.1970389",
  ISSN =         "1544-3566 (print), 1544-3973 (electronic)",
  ISSN-L =       "1544-3566",
  bibdate =      "Fri Jun 17 18:32:40 MDT 2011",
  bibsource =    "http://portal.acm.org/;
                 http://www.math.utah.edu/pub/tex/bib/taco.bib",
  abstract =     "While multicore processors promise large performance
                 benefits for parallel applications, writing these
                 applications is notoriously difficult. Tuning a
                 parallel application to achieve good performance, also
                 known as performance debugging, is often more
                 challenging than debugging the application for
                 correctness. Parallel programs have many
                 performance-related issues that are not seen in
                 sequential programs. An increase in cache misses is one
                 of the biggest challenges that programmers face. To
                 minimize these misses, programmers must not only
                 identify the source of the extra misses, but also
                 perform the tricky task of determining if the misses
                 are caused by interthread communication (i.e.,
                 coherence misses) and if so, whether they are caused by
                 true or false sharing (since the solutions for these
                 two are quite different).",
  acknowledgement = ack-nhfb,
  articleno =    "8",
  fjournal =     "ACM Transactions on Architecture and Code Optimization
                 (TACO)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J924",
}

@Article{Hiser:2011:EIB,
  author =       "Jason D. Hiser and Daniel W. Williams and Wei Hu and
                 Jack W. Davidson and Jason Mars and Bruce R. Childers",
  title =        "Evaluating indirect branch handling mechanisms in
                 software dynamic translation systems",
  journal =      j-TACO,
  volume =       "8",
  number =       "2",
  pages =        "9:1--9:??",
  month =        jul,
  year =         "2011",
  CODEN =        "????",
  DOI =          "http://dx.doi.org/10.1145/1970386.1970390",
  ISSN =         "1544-3566 (print), 1544-3973 (electronic)",
  ISSN-L =       "1544-3566",
  bibdate =      "Fri Jun 17 18:32:40 MDT 2011",
  bibsource =    "http://portal.acm.org/;
                 http://www.math.utah.edu/pub/tex/bib/taco.bib",
  abstract =     "Software Dynamic Translation (SDT) is used for
                 instrumentation, optimization, security, and many other
                 uses. A major source of SDT overhead is the execution
                 of code to translate an indirect branch's target
                 address into the translated destination block's
                 address. This article discusses sources of Indirect
                 Branch (IB) overhead in SDT systems and evaluates
                 techniques for overhead reduction. Measurements using
                 SPEC CPU2000 show that the appropriate choice and
                 configuration of IB translation mechanisms can
                 significantly reduce the overhead. Further,
                 cross-architecture evaluation of these mechanisms
                 reveals that the most efficient implementation and
                 configuration can be highly dependent on the
                 architecture implementation.",
  acknowledgement = ack-nhfb,
  articleno =    "9",
  fjournal =     "ACM Transactions on Architecture and Code Optimization
                 (TACO)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J924",
}

@Article{Chen:2011:HAM,
  author =       "Xi E. Chen and Tor M. Aamodt",
  title =        "Hybrid analytical modeling of pending cache hits, data
                 prefetching, and {MSHRs}",
  journal =      j-TACO,
  volume =       "8",
  number =       "3",
  pages =        "10:1--10:??",
  month =        oct,
  year =         "2011",
  CODEN =        "????",
  DOI =          "http://dx.doi.org/10.1145/2019608.2019609",
  ISSN =         "1544-3566 (print), 1544-3973 (electronic)",
  ISSN-L =       "1544-3566",
  bibdate =      "Sat Oct 22 09:15:12 MDT 2011",
  bibsource =    "http://portal.acm.org/;
                 http://www.math.utah.edu/pub/tex/bib/taco.bib",
  abstract =     "This article proposes techniques to predict the
                 performance impact of pending cache hits, hardware
                 prefetching, and miss status holding register resources
                 on superscalar microprocessors using hybrid analytical
                 models. The proposed models focus on timeliness of
                 pending hits and prefetches and account for a limited
                 number of MSHRs. They improve modeling accuracy of
                 pending hits by $ 3.9 \times $ and when modeling data
                 prefetching, a limited number of MSHRs, or both, these
                 techniques result in average errors of 9.5\% to
                 17.8\%. The impact of non-uniform DRAM memory latency
                 is shown to be approximated well by using a moving
                 average of memory access latency.",
  acknowledgement = ack-nhfb,
  articleno =    "10",
  fjournal =     "ACM Transactions on Architecture and Code Optimization
                 (TACO)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J924",
}

@Article{Kleanthous:2011:CMD,
    author          = {Marios Kleanthous and Yiannakis Sazeides},
    title           = {{CATCH}: a mechanism for dynamically detecting
                       cache-content-duplication in instruction caches},
    journal         = j-TACO,
    volume          = {8},
    number          = {3},
    pages           = {11:1--11:??},
    month           = oct,
    year            = {2011},
    CODEN           = {????},
    DOI             = {http://dx.doi.org/10.1145/2019608.2019610},
    ISSN            = {1544-3566 (print), 1544-3973 (electronic)},
    ISSN-L          = {1544-3566},
    bibdate         = {Sat Oct 22 09:15:12 MDT 2011},
    bibsource       = {http://portal.acm.org/;
                       http://www.math.utah.edu/pub/tex/bib/taco.bib},
    abstract        = {Cache-content-duplication (CCD) occurs when there is a
                       miss for a block in a cache and the entire content of
                       the missed block is already in the cache in a block
                       with a different tag. Caches aware of
                       content-duplication can have lower miss penalty by
                       fetching, on a miss to a duplicate block, directly from
                       the cache instead of accessing lower in the memory
                       hierarchy, and can have lower miss rates by allowing
                       only blocks with unique content to enter a cache. This
                       work examines the potential of CCD for instruction
                       caches. We show that CCD is a frequent phenomenon and
                       that an idealized duplication-detection mechanism for
                       instruction caches has the potential to increase
                       performance of an out-of-order processor, with a 16KB,
                       8-way, 8 instructions per block instruction cache,
                       often by more than 10\% and up to 36\%.},
    acknowledgement = ack-nhfb,
    articleno       = {11},
    fjournal        = {ACM Transactions on Architecture and Code Optimization
                       (TACO)},
    journal-URL     = {http://portal.acm.org/browse_dl.cfm?idx=J924},
}

@Article{Vandierendonck:2011:MSR,
    author          = {Hans Vandierendonck and Andr{\'e} Seznec},
    title           = {Managing {SMT} resource usage through speculative
                       instruction window weighting},
    journal         = j-TACO,
    volume          = {8},
    number          = {3},
    pages           = {12:1--12:??},
    month           = oct,
    year            = {2011},
    CODEN           = {????},
    DOI             = {http://dx.doi.org/10.1145/2019608.2019611},
    ISSN            = {1544-3566 (print), 1544-3973 (electronic)},
    ISSN-L          = {1544-3566},
    bibdate         = {Sat Oct 22 09:15:12 MDT 2011},
    bibsource       = {http://portal.acm.org/;
                       http://www.math.utah.edu/pub/tex/bib/taco.bib},
    abstract        = {Simultaneous multithreading processors dynamically
                       share processor resources between multiple threads. In
                       general, shared SMT resources may be managed
                       explicitly, for instance, by dynamically setting queue
                       occupation bounds for each thread as in the DCRA and
                       Hill-Climbing policies. Alternatively, resources may be
                       managed implicitly; that is, resource usage is
                       controlled by placing the desired instruction mix in
                       the resources. In this case, the main resource
                       management tool is the instruction fetch policy which
                       must predict the behavior of each thread (branch
                       mispredictions, long-latency loads, etc.) as it fetches
                       instructions. In this article, we present the use of
                       Speculative Instruction Window Weighting (SIWW) to
                       bridge the gap between implicit and explicit SMT fetch
                       policies.},
    acknowledgement = ack-nhfb,
    articleno       = {12},
    fjournal        = {ACM Transactions on Architecture and Code Optimization
                       (TACO)},
    journal-URL     = {http://portal.acm.org/browse_dl.cfm?idx=J924},
}

@Article{Wang:2011:PGS,
    author          = {Po-Han Wang and Chia-Lin Yang and Yen-Ming Chen and
                       Yu-Jung Cheng},
    title           = {Power gating strategies on {GPUs}},
    journal         = j-TACO,
    volume          = {8},
    number          = {3},
    pages           = {13:1--13:??},
    month           = oct,
    year            = {2011},
    CODEN           = {????},
    DOI             = {http://dx.doi.org/10.1145/2019608.2019612},
    ISSN            = {1544-3566 (print), 1544-3973 (electronic)},
    ISSN-L          = {1544-3566},
    bibdate         = {Sat Oct 22 09:15:12 MDT 2011},
    bibsource       = {http://portal.acm.org/;
                       http://www.math.utah.edu/pub/tex/bib/taco.bib},
    abstract        = {As technology continues to shrink, reducing leakage is
                       critical to achieving energy efficiency. Previous
                       studies on low-power GPUs (Graphics Processing Units)
                       focused on techniques for dynamic power reduction, such
                       as DVFS (Dynamic Voltage and Frequency Scaling) and
                       clock gating. In this paper, we explore the potential
                       of adopting architecture-level power gating techniques
                       for leakage reduction on GPUs. We propose three
                       strategies for applying power gating on different
                       modules in GPUs. The Predictive Shader Shutdown
                       technique exploits workload variation across frames to
                       eliminate leakage in shader clusters. Deferred Geometry
                       Pipeline seeks to minimize leakage in fixed-function
                       geometry units by utilizing an imbalance between
                       geometry and fragment computation across batches.},
    acknowledgement = ack-nhfb,
    articleno       = {13},
    fjournal        = {ACM Transactions on Architecture and Code Optimization
                       (TACO)},
    journal-URL     = {http://portal.acm.org/browse_dl.cfm?idx=J924},
}

@Article{Feng:2011:DAD,
    author          = {Min Feng and Chen Tian and Changhui Lin and Rajiv
                       Gupta},
    title           = {Dynamic access distance driven cache replacement},
    journal         = j-TACO,
    volume          = {8},
    number          = {3},
    pages           = {14:1--14:??},
    month           = oct,
    year            = {2011},
    CODEN           = {????},
    DOI             = {http://dx.doi.org/10.1145/2019608.2019613},
    ISSN            = {1544-3566 (print), 1544-3973 (electronic)},
    ISSN-L          = {1544-3566},
    bibdate         = {Sat Oct 22 09:15:12 MDT 2011},
    bibsource       = {http://portal.acm.org/;
                       http://www.math.utah.edu/pub/tex/bib/taco.bib},
    abstract        = {In this article, we propose a new cache replacement
                       policy that makes the replacement decision based on the
                       reuse information of the cache lines and the requested
                       data. We present the architectural support and evaluate
                       the performance of our approach using SPEC benchmarks.
                       We also develop two reuse information predictors: a
                       profile-based static predictor and a runtime predictor.
                       The applicability of each predictor is discussed in
                       this paper. We further extend our reuse information
                       predictors so that the cache can adaptively choose
                       between the reuse information based replacement policy
                       and an approximation of LRU policy. According to the
                       experimental results, our adaptive reuse information
                       based replacement policy performs either better than or
                       close to the LRU policy.},
    acknowledgement = ack-nhfb,
    articleno       = {14},
    fjournal        = {ACM Transactions on Architecture and Code Optimization
                       (TACO)},
    journal-URL     = {http://portal.acm.org/browse_dl.cfm?idx=J924},
}

@Article{Samih:2011:EPP,
    author          = {Ahmad Samih and Yan Solihin and Anil Krishna},
    title           = {Evaluating placement policies for managing capacity
                       sharing in {CMP} architectures with private caches},
    journal         = j-TACO,
    volume          = {8},
    number          = {3},
    pages           = {15:1--15:??},
    month           = oct,
    year            = {2011},
    CODEN           = {????},
    DOI             = {http://dx.doi.org/10.1145/2019608.2019614},
    ISSN            = {1544-3566 (print), 1544-3973 (electronic)},
    ISSN-L          = {1544-3566},
    bibdate         = {Sat Oct 22 09:15:12 MDT 2011},
    bibsource       = {http://portal.acm.org/;
                       http://www.math.utah.edu/pub/tex/bib/taco.bib},
    abstract        = {Chip Multiprocessors (CMP) with distributed L2 caches
                       suffer from a cache fragmentation problem; some caches
                       may be overutilized while others may be underutilized.
                       To avoid such fragmentation, researchers have proposed
                       capacity sharing mechanisms where applications that
                       need additional cache space can place their victim
                       blocks in remote caches. However, we found that only
                       allowing victim blocks to be placed on remote caches
                       tends to cause a high number of remote cache hits
                       relative to local cache hits. In this article, we show
                       that many of the remote cache hits can be converted
                       into local cache hits if we allow newly fetched blocks
                       to be selectively placed directly in a remote cache,
                       rather than in the local cache.},
    acknowledgement = ack-nhfb,
    articleno       = {15},
    fjournal        = {ACM Transactions on Architecture and Code Optimization
                       (TACO)},
    journal-URL     = {http://portal.acm.org/browse_dl.cfm?idx=J924},
}

@Article{Yeh:2011:MPP,
    author          = {Chang-Ching Yeh and Kuei-Chung Chang and Tien-Fu Chen
                       and Chingwei Yeh},
    title           = {Maintaining performance on power gating of
                       microprocessor functional units by using a predictive
                       pre-wakeup strategy},
    journal         = j-TACO,
    volume          = {8},
    number          = {3},
    pages           = {16:1--16:??},
    month           = oct,
    year            = {2011},
    CODEN           = {????},
    DOI             = {http://dx.doi.org/10.1145/2019608.2019615},
    ISSN            = {1544-3566 (print), 1544-3973 (electronic)},
    ISSN-L          = {1544-3566},
    bibdate         = {Sat Oct 22 09:15:12 MDT 2011},
    bibsource       = {http://portal.acm.org/;
                       http://www.math.utah.edu/pub/tex/bib/taco.bib},
    abstract        = {Power gating is an effective technique for reducing
                       leakage power in deep submicron CMOS technology.
                       Microarchitectural techniques for power gating of
                       functional units have been developed by detecting
                       suitable idle regions and turning them off to reduce
                       leakage energy consumption; however, wakeup of
                       functional units is needed when instructions are ready
                       for execution such that wakeup overhead is naturally
                       incurred. This study presents time-based power gating
                       with reference pre-wakeup (PGRP), a novel predictive
                       strategy that detects suitable idle periods for power
                       gating and then enables pre-wakeup of needed functional
                       units for avoiding wakeup overhead. The key insight is
                       that most wakeups are repeated due to program
                       locality.},
    acknowledgement = ack-nhfb,
    articleno       = {16},
    fjournal        = {ACM Transactions on Architecture and Code Optimization
                       (TACO)},
    journal-URL     = {http://portal.acm.org/browse_dl.cfm?idx=J924},
}

@Article{Lee:2011:DDE,
    author          = {Hyunjin Lee and Sangyeun Cho and Bruce R. Childers},
    title           = {{DEFCAM}: a design and evaluation framework for
                       defect-tolerant cache memories},
    journal         = j-TACO,
    volume          = {8},
    number          = {3},
    pages           = {17:1--17:??},
    month           = oct,
    year            = {2011},
    CODEN           = {????},
    DOI             = {http://dx.doi.org/10.1145/2019608.2019616},
    ISSN            = {1544-3566 (print), 1544-3973 (electronic)},
    ISSN-L          = {1544-3566},
    bibdate         = {Sat Oct 22 09:15:12 MDT 2011},
    bibsource       = {http://portal.acm.org/;
                       http://www.math.utah.edu/pub/tex/bib/taco.bib},
    abstract        = {Advances in deep submicron technology call for a
                       careful review of existing cache designs and design
                       practices in terms of yield, area, and performance.
                       This article presents a Design and Evaluation Framework
                       for defect-tolerant Cache Memories (DEFCAM), which
                       enables processor architects to consider yield, area,
                       and performance together in a unified framework. Since
                       there is a complex, changing trade-off among these
                       metrics depending on the technology, the cache
                       organization, and the yield enhancement scheme
                       employed, such a design flow is invaluable to processor
                       architects when they assess a design and explore the
                       design space quickly at an early stage. We develop a
                       complete framework supporting the proposed DEFCAM
                       design flow, from injecting defects into a wafer to
                       evaluating program performance of individual processors
                       on the wafer.},
    acknowledgement = ack-nhfb,
    articleno       = {17},
    fjournal        = {ACM Transactions on Architecture and Code Optimization
                       (TACO)},
    journal-URL     = {http://portal.acm.org/browse_dl.cfm?idx=J924},
}

@Article{Stenstrom:2012:ISI,
    author          = {Per Stenstr{\"o}m and Koen {De Bosschere}},
    title           = {Introduction to the special issue on high-performance
                       and embedded architectures and compilers},
    journal         = j-TACO,
    volume          = {8},
    number          = {4},
    pages           = {18:1--18:??},
    month           = jan,
    year            = {2012},
    CODEN           = {????},
    DOI             = {http://dx.doi.org/10.1145/2086696.2086697},
    ISSN            = {1544-3566 (print), 1544-3973 (electronic)},
    ISSN-L          = {1544-3566},
    bibdate         = {Sat Jan 21 07:49:49 MST 2012},
    bibsource       = {http://portal.acm.org/;
                       http://www.math.utah.edu/pub/tex/bib/taco.bib},
    acknowledgement = ack-nhfb,
    articleno       = {18},
    fjournal        = {ACM Transactions on Architecture and Code Optimization
                       (TACO)},
    journal-URL     = {http://portal.acm.org/browse_dl.cfm?idx=J924},
}

@Article{Albericio:2012:ALC,
    author          = {Jorge Albericio and Rub{\'e}n Gran and Pablo
                       Ib{\'a}{\~n}ez and V{\'\i}ctor Vi{\~n}als and Jose
                       Mar{\'\i}a Llaber{\'\i}a},
    title           = {{ABS}: a low-cost adaptive controller for
                       prefetching in a banked shared last-level cache},
    journal         = j-TACO,
    volume          = {8},
    number          = {4},
    pages           = {19:1--19:??},
    month           = jan,
    year            = {2012},
    CODEN           = {????},
    DOI             = {http://dx.doi.org/10.1145/2086696.2086698},
    ISSN            = {1544-3566 (print), 1544-3973 (electronic)},
    ISSN-L          = {1544-3566},
    bibdate         = {Sat Jan 21 07:49:49 MST 2012},
    bibsource       = {http://portal.acm.org/;
                       http://www.math.utah.edu/pub/tex/bib/taco.bib},
    abstract        = {Hardware data prefetch is a very well known technique
                       for hiding memory latencies. However, in a multicore
                       system fitted with a shared Last-Level Cache (LLC),
                       prefetch induced by a core consumes common resources
                       such as shared cache space and main memory bandwidth.
                       This may degrade the performance of other cores and
                       even the overall system performance unless the prefetch
                       aggressiveness of each core is controlled from a system
                       standpoint. On the other hand, LLCs in commercial chip
                       multiprocessors are more and more frequently organized
                       in independent banks. In this contribution, we target
                       for the first time prefetch in a banked LLC
                       organization and propose ABS, a low-cost controller
                       with a hill-climbing approach that runs stand-alone at
                       each LLC bank without requiring inter-bank
                       communication.},
    acknowledgement = ack-nhfb,
    articleno       = {19},
    fjournal        = {ACM Transactions on Architecture and Code Optimization
                       (TACO)},
    journal-URL     = {http://portal.acm.org/browse_dl.cfm?idx=J924},
}

@Article{Bayrak:2012:AII,
  author = {Ali Galip Bayrak and Nikola Velickovic and Paolo Ienne and
    Wayne Burleson},
  title = {An architecture-independent instruction shuffler to protect
    against side-channel attacks},
  journal = j-TACO,
  fjournal = {ACM Transactions on Architecture and Code Optimization (TACO)},
  journal-URL = {http://portal.acm.org/browse_dl.cfm?idx=J924},
  volume = {8},
  number = {4},
  articleno = {20},
  pages = {20:1--20:??},
  month = jan,
  year = {2012},
  DOI = {http://dx.doi.org/10.1145/2086696.2086699},
  CODEN = {????},
  ISSN = {1544-3566 (print), 1544-3973 (electronic)},
  ISSN-L = {1544-3566},
  abstract = {Embedded cryptographic systems, such as smart cards, require
    secure implementations that are robust to a variety of low-level
    attacks. Side-Channel Attacks (SCA) exploit the information such as
    power consumption, electromagnetic radiation and acoustic leaking
    through the device to uncover the secret information. Attackers can
    mount successful attacks with very modest resources in a short time
    period. Therefore, many methods have been proposed to increase the
    security against SCA. Randomizing the execution order of the
    instructions that are independent, i.e., random shuffling, is one of
    the most popular among them. Implementing instruction shuffling in
    software is either implementation specific or has a significant
    performance or code size overhead.},
  bibdate = {Sat Jan 21 07:49:49 MST 2012},
  bibsource = {http://portal.acm.org/;
    http://www.math.utah.edu/pub/tex/bib/taco.bib},
  acknowledgement = ack-nhfb,
}

@Article{Demme:2012:AGC,
  author = {John Demme and Simha Sethumadhavan},
  title = {Approximate graph clustering for program characterization},
  journal = j-TACO,
  fjournal = {ACM Transactions on Architecture and Code Optimization (TACO)},
  journal-URL = {http://portal.acm.org/browse_dl.cfm?idx=J924},
  volume = {8},
  number = {4},
  articleno = {21},
  pages = {21:1--21:??},
  month = jan,
  year = {2012},
  DOI = {http://dx.doi.org/10.1145/2086696.2086700},
  CODEN = {????},
  ISSN = {1544-3566 (print), 1544-3973 (electronic)},
  ISSN-L = {1544-3566},
  abstract = {An important aspect of system optimization research is the
    discovery of program traits or behaviors. In this paper, we present
    an automated method of program characterization which is able to
    examine and cluster program graphs, i.e., dynamic data graphs or
    control flow graphs. Our novel approximate graph clustering
    technology allows users to find groups of program fragments which
    contain similar code idioms or patterns in data reuse, control flow,
    and context. Patterns of this nature have several potential
    applications including development of new static or dynamic
    optimizations to be implemented in software or in hardware. For the
    SPEC CPU 2006 suite of benchmarks, our results show that approximate
    graph clustering is effective at grouping behaviorally similar
    functions.},
  bibdate = {Sat Jan 21 07:49:49 MST 2012},
  bibsource = {http://portal.acm.org/;
    http://www.math.utah.edu/pub/tex/bib/taco.bib},
  acknowledgement = ack-nhfb,
}

@Article{Pricopi:2012:BPH,
  author = {Mihai Pricopi and Tulika Mitra},
  title = {{Bahurupi}: a polymorphic heterogeneous multi-core
    architecture},
  journal = j-TACO,
  fjournal = {ACM Transactions on Architecture and Code Optimization (TACO)},
  journal-URL = {http://portal.acm.org/browse_dl.cfm?idx=J924},
  volume = {8},
  number = {4},
  articleno = {22},
  pages = {22:1--22:??},
  month = jan,
  year = {2012},
  DOI = {http://dx.doi.org/10.1145/2086696.2086701},
  CODEN = {????},
  ISSN = {1544-3566 (print), 1544-3973 (electronic)},
  ISSN-L = {1544-3566},
  abstract = {Computing systems have made an irreversible transition
    towards parallel architectures with the emergence of multi-cores.
    Moreover, power and thermal limits in embedded systems mandate the
    deployment of many simpler cores rather than a few complex cores on
    chip. Consumer electronic devices, on the other hand, need to support
    an ever-changing set of diverse applications with varying performance
    demands. While some applications can benefit from thread-level
    parallelism offered by multi-core solutions, there still exist a
    large number of applications with substantial amount of sequential
    code. The sequential programs suffer from limited exploitation of
    instruction-level parallelism in simple cores. We propose a
    reconfigurable multi-core architecture, called Bahurupi, that can
    successfully reconcile the conflicting demands of instruction-level
    and thread-level parallelism.},
  bibdate = {Sat Jan 21 07:49:49 MST 2012},
  bibsource = {http://portal.acm.org/;
    http://www.math.utah.edu/pub/tex/bib/taco.bib},
  acknowledgement = ack-nhfb,
}

@Article{Cleemput:2012:CMT,
  author = {Jeroen V. Cleemput and Bart Coppens and Bjorn {De Sutter}},
  title = {Compiler mitigations for time attacks on modern x86
    processors},
  journal = j-TACO,
  fjournal = {ACM Transactions on Architecture and Code Optimization (TACO)},
  journal-URL = {http://portal.acm.org/browse_dl.cfm?idx=J924},
  volume = {8},
  number = {4},
  articleno = {23},
  pages = {23:1--23:??},
  month = jan,
  year = {2012},
  DOI = {http://dx.doi.org/10.1145/2086696.2086702},
  CODEN = {????},
  ISSN = {1544-3566 (print), 1544-3973 (electronic)},
  ISSN-L = {1544-3566},
  abstract = {This paper studies and evaluates the extent to which
    automated compiler techniques can defend against timing-based side
    channel attacks on modern x86 processors. We study how modern x86
    processors can leak timing information through side channels that
    relate to data flow. We study the efficiency, effectiveness,
    portability, predictability and sensitivity of several mitigating
    code transformations that eliminate or minimize key-dependent
    execution time variations. Furthermore, we discuss the extent to
    which compiler backends are a suitable tool to provide automated
    support for the proposed mitigations.},
  bibdate = {Sat Jan 21 07:49:49 MST 2012},
  bibsource = {http://portal.acm.org/;
    http://www.math.utah.edu/pub/tex/bib/taco.bib},
  acknowledgement = ack-nhfb,
}

%%% The citation key below keeps the "Mccandless" spelling so that
%%% existing citations of this entry continue to resolve; only the
%%% author field carries the surname's internal capital letter.
@Article{Mccandless:2012:CTI,
  author =       "Jason McCandless and David Gregg",
  title =        "Compiler techniques to improve dynamic branch
                 prediction for indirect jump and call instructions",
  journal =      j-TACO,
  volume =       "8",
  number =       "4",
  pages =        "24:1--24:??",
  month =        jan,
  year =         "2012",
  CODEN =        "????",
  DOI =          "http://dx.doi.org/10.1145/2086696.2086703",
  ISSN =         "1544-3566 (print), 1544-3973 (electronic)",
  ISSN-L =       "1544-3566",
  bibdate =      "Sat Jan 21 07:49:49 MST 2012",
  bibsource =    "http://portal.acm.org/;
                 http://www.math.utah.edu/pub/tex/bib/taco.bib",
  abstract =     "Indirect jump instructions are used to implement
                 multiway branch statements and virtual function calls
                 in object-oriented languages. Branch behavior can have
                 significant impact on program performance, but
                 fortunately hardware predictors can alleviate much of
                 the risk. Modern processors include indirect branch
                 predictors which use part of the target address to
                 update a global history. We present a code generation
                 technique to maximize the branch history information
                 available to the predictor. We implement our
                 optimization as an assembly language transformation,
                 and evaluate it for SPEC benchmarks and interpreters
                 using simulated and real hardware, showing indirect
                 branch misprediction decreases.",
  acknowledgement = ack-nhfb,
  articleno =    "24",
  fjournal =     "ACM Transactions on Architecture and Code Optimization
                 (TACO)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J924",
}

@Article{Garcia-Guirado:2012:DDA,
  author = {Antonio Garc{\'\i}a-Guirado and Ricardo Fern{\'a}ndez-Pascual
    and Alberto Ros and Jos{\'e} M. Garc{\'\i}a},
  title = {{DAPSCO}: Distance-aware partially shared cache organization},
  journal = j-TACO,
  fjournal = {ACM Transactions on Architecture and Code Optimization (TACO)},
  journal-URL = {http://portal.acm.org/browse_dl.cfm?idx=J924},
  volume = {8},
  number = {4},
  articleno = {25},
  pages = {25:1--25:??},
  month = jan,
  year = {2012},
  DOI = {http://dx.doi.org/10.1145/2086696.2086704},
  CODEN = {????},
  ISSN = {1544-3566 (print), 1544-3973 (electronic)},
  ISSN-L = {1544-3566},
  abstract = {Many-core tiled CMP proposals often assume a partially
    shared last level cache (LLC) since this provides a good compromise
    between access latency and cache utilization. In this paper, we
    propose a novel way to map memory addresses to LLC banks that takes
    into account the average distance between the banks and the tiles
    that access them. Contrary to traditional approaches, our mapping
    does not group the tiles in clusters within which all the cores
    access the same bank for the same addresses. Instead, two neighboring
    cores access different sets of banks minimizing the average distance
    travelled by the cache requests. Results for a 64-core CMP show that
    our proposal improves both execution time and the energy consumed by
    the network by 13\% when compared to a traditional mapping.},
  bibdate = {Sat Jan 21 07:49:49 MST 2012},
  bibsource = {http://portal.acm.org/;
    http://www.math.utah.edu/pub/tex/bib/taco.bib},
  acknowledgement = ack-nhfb,
}

@Article{Wang:2012:FSS,
  author = {Zhenjiang Wang and Chenggang Wu and Pen-Chung Yew and Jianjun
    Li and Di Xu},
  title = {On-the-fly structure splitting for heap objects},
  journal = j-TACO,
  fjournal = {ACM Transactions on Architecture and Code Optimization (TACO)},
  journal-URL = {http://portal.acm.org/browse_dl.cfm?idx=J924},
  volume = {8},
  number = {4},
  articleno = {26},
  pages = {26:1--26:??},
  month = jan,
  year = {2012},
  DOI = {http://dx.doi.org/10.1145/2086696.2086705},
  CODEN = {????},
  ISSN = {1544-3566 (print), 1544-3973 (electronic)},
  ISSN-L = {1544-3566},
  abstract = {With the advent of multicore systems, the gap between
    processor speed and memory latency has grown worse because of their
    complex interconnect. Sophisticated techniques are needed more than
    ever to improve an application's spatial and temporal locality. This
    paper describes an optimization that aims to improve heap data layout
    by structure-splitting. It also provides runtime address checking by
    piggybacking on the existing page protection mechanism to guarantee
    the correctness of such optimization that has eluded many previous
    attempts due to safety concerns. The technique can be applied to both
    sequential and parallel programs at either compile time or runtime.
    However, we focus primarily on sequential programs (i.e.,
    single-threaded programs) at runtime in this paper.},
  bibdate = {Sat Jan 21 07:49:49 MST 2012},
  bibsource = {http://portal.acm.org/;
    http://www.math.utah.edu/pub/tex/bib/taco.bib},
  acknowledgement = ack-nhfb,
}

%%% The second author's family name is the compound "Dupont de
%%% Dinechin"; it is braced as a single unit so that BibTeX does not
%%% take "Dupont" to be a given name.  The phi in the abstract is set in
%%% math mode because \phi is a math-only command.
@Article{Das:2012:ELC,
  author =       "Dibyendu Das and B. {Dupont de Dinechin} and
                 Ramakrishna Upadrasta",
  title =        "Efficient liveness computation using merge sets and
                 {DJ}-graphs",
  journal =      j-TACO,
  volume =       "8",
  number =       "4",
  pages =        "27:1--27:??",
  month =        jan,
  year =         "2012",
  CODEN =        "????",
  DOI =          "http://dx.doi.org/10.1145/2086696.2086706",
  ISSN =         "1544-3566 (print), 1544-3973 (electronic)",
  ISSN-L =       "1544-3566",
  bibdate =      "Sat Jan 21 07:49:49 MST 2012",
  bibsource =    "http://portal.acm.org/;
                 http://www.math.utah.edu/pub/tex/bib/taco.bib",
  abstract =     "In this work we devise an efficient algorithm that
                 computes the liveness information of program variables.
                 The algorithm employs SSA form and DJ-graphs as
                 representation to build Merge sets. The Merge set of
                 node n, M(n) is based on the structure of the Control
                 Flow Graph (CFG) and consists of all nodes where a
                 $ \phi $-function needs to be placed, if a definition of
                 a variable appears in n. The merge sets of a CFG can be
                 computed using DJ-graphs without prior knowledge of how
                 the variables are used and defined. Later, we can
                 answer the liveness query (as a part of other
                 optimization or analysis phase) by utilizing the
                 knowledge of the use/def of variables, the dominator
                 tree and the pre-computed merge sets.",
  acknowledgement = ack-nhfb,
  articleno =    "27",
  fjournal =     "ACM Transactions on Architecture and Code Optimization
                 (TACO)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J924",
}

@Article{Patsilaras:2012:EEM,
  author = {George Patsilaras and Niket K. Choudhary and James Tuck},
  title = {Efficiently exploiting memory level parallelism on asymmetric
    coupled cores in the dark silicon era},
  journal = j-TACO,
  fjournal = {ACM Transactions on Architecture and Code Optimization (TACO)},
  journal-URL = {http://portal.acm.org/browse_dl.cfm?idx=J924},
  volume = {8},
  number = {4},
  articleno = {28},
  pages = {28:1--28:??},
  month = jan,
  year = {2012},
  DOI = {http://dx.doi.org/10.1145/2086696.2086707},
  CODEN = {????},
  ISSN = {1544-3566 (print), 1544-3973 (electronic)},
  ISSN-L = {1544-3566},
  abstract = {Extracting high memory-level parallelism (MLP) is essential
    for speeding up single-threaded applications which are memory bound.
    At the same time, the projected amount of dark silicon (the fraction
    of the chip powered off) on a chip is growing. Hence, Asymmetric
    Multicore Processors (AMP) offer a unique opportunity to integrate
    many types of cores, each powered at different times, in order to
    optimize for different regions of execution. In this work, we
    quantify the potential for exploiting core customization to speedup
    programs during regions of high MLP. Based on a careful design space
    exploration, we discover that an AMP that includes a narrow and fast
    specialized core has the potential to efficiently exploit MLP.},
  bibdate = {Sat Jan 21 07:49:49 MST 2012},
  bibsource = {http://portal.acm.org/;
    http://www.math.utah.edu/pub/tex/bib/taco.bib},
  acknowledgement = ack-nhfb,
}

%%% Abstract: the stray apostrophe after "For such applications" has
%%% been replaced by the comma that the sentence calls for.
%%% NOTE(review): confirm against the publisher's abstract if a verbatim
%%% copy is required.
@Article{Malits:2012:ELG,
  author =       "Roman Malits and Evgeny Bolotin and Avinoam Kolodny
                 and Avi Mendelson",
  title =        "Exploring the limits of {GPGPU} scheduling in control
                 flow bound applications",
  journal =      j-TACO,
  volume =       "8",
  number =       "4",
  pages =        "29:1--29:??",
  month =        jan,
  year =         "2012",
  CODEN =        "????",
  DOI =          "http://dx.doi.org/10.1145/2086696.2086708",
  ISSN =         "1544-3566 (print), 1544-3973 (electronic)",
  ISSN-L =       "1544-3566",
  bibdate =      "Sat Jan 21 07:49:49 MST 2012",
  bibsource =    "http://portal.acm.org/;
                 http://www.math.utah.edu/pub/tex/bib/taco.bib",
  abstract =     "GPGPUs are optimized for graphics, for that reason the
                 hardware is optimized for massively data parallel
                 applications characterized by predictable memory access
                 patterns and little control flow. For such
                 applications, e.g., matrix multiplication, GPGPU based
                 system can achieve very high performance. However, many
                 general purpose data parallel applications are
                 characterized as having intensive control flow and
                 unpredictable memory access patterns. Optimizing the
                 code in such problems for current hardware is often
                 ineffective and even impractical since it exhibits low
                 hardware utilization leading to relatively low
                 performance. This work tracks the root causes of
                 execution inefficacies when running control flow
                 intensive CUDA applications on NVIDIA GPGPU hardware.",
  acknowledgement = ack-nhfb,
  articleno =    "29",
  fjournal =     "ACM Transactions on Architecture and Code Optimization
                 (TACO)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J924",
}

@Article{Orosa:2012:FIF,
  author = {Lois Orosa and Elisardo Antelo and Javier D. Bruguera},
  title = {{FlexSig}: {Implementing} flexible hardware signatures},
  journal = j-TACO,
  fjournal = {ACM Transactions on Architecture and Code Optimization (TACO)},
  journal-URL = {http://portal.acm.org/browse_dl.cfm?idx=J924},
  volume = {8},
  number = {4},
  articleno = {30},
  pages = {30:1--30:??},
  month = jan,
  year = {2012},
  DOI = {http://dx.doi.org/10.1145/2086696.2086709},
  CODEN = {????},
  ISSN = {1544-3566 (print), 1544-3973 (electronic)},
  ISSN-L = {1544-3566},
  abstract = {With the advent of chip multiprocessors, new techniques
    have been developed to make parallel programming easier and more
    reliable. New parallel programming paradigms and new methods of
    making the execution of programs more efficient and more reliable
    have been developed. Usually, these improvements require hardware
    support to avoid a system slowdown. Signatures based on Bloom filters
    are widely used as hardware support for parallel programming in chip
    multiprocessors. Signatures are used in Transactional Memory,
    thread-level speculation, parallel debugging, deterministic replay
    and other tools and applications. The main limitation of hardware
    signatures is the lack of flexibility: if signatures are designed
    with a given configuration, tailored to the requirements of a
    specific tool or application, it is likely that they do not fit well
    for other different requirements.},
  bibdate = {Sat Jan 21 07:49:49 MST 2012},
  bibsource = {http://portal.acm.org/;
    http://www.math.utah.edu/pub/tex/bib/taco.bib},
  acknowledgement = ack-nhfb,
}

%%% The third author's name carries its accents so that it matches the
%%% spelling of the same co-author in entry Garcia-Guirado:2012:DDA and
%%% is not split into two people in author indexes.
@Article{Titos-Gil:2012:HTM,
  author =       "Ruben Titos-Gil and Manuel E. Acacio and Jos{\'e} M.
                 Garc{\'\i}a and Tim Harris and Adrian Cristal and Osman
                 Unsal and Ibrahim Hur and Mateo Valero",
  title =        "Hardware transactional memory with software-defined
                 conflicts",
  journal =      j-TACO,
  volume =       "8",
  number =       "4",
  pages =        "31:1--31:??",
  month =        jan,
  year =         "2012",
  CODEN =        "????",
  DOI =          "http://dx.doi.org/10.1145/2086696.2086710",
  ISSN =         "1544-3566 (print), 1544-3973 (electronic)",
  ISSN-L =       "1544-3566",
  bibdate =      "Sat Jan 21 07:49:49 MST 2012",
  bibsource =    "http://portal.acm.org/;
                 http://www.math.utah.edu/pub/tex/bib/taco.bib",
  abstract =     "In this paper we investigate the benefits of turning
                 the concept of transactional conflict from its
                 traditionally fixed definition into a variable one that
                 can be dynamically controlled in software. We propose
                 the extension of the atomic language construct with an
                 attribute that specifies the definition of conflict, so
                 that programmers can write code which adjusts what
                 kinds of conflicts are to be detected, relaxing or
                 tightening the conditions according to the forms of
                 interference that can be tolerated by a particular
                 algorithm. Using this performance-motivated construct,
                 specific conflict information can be associated with
                 portions of code, as each transaction is provided with
                 a local definition that applies while it executes.",
  acknowledgement = ack-nhfb,
  articleno =    "31",
  fjournal =     "ACM Transactions on Architecture and Code Optimization
                 (TACO)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J924",
}

@Article{Kim:2012:IPN,
  author = {Yongjoo Kim and Jongeun Lee and Toan X. Mai and Yunheung
    Paek},
  title = {Improving performance of nested loops on reconfigurable array
    processors},
  journal = j-TACO,
  fjournal = {ACM Transactions on Architecture and Code Optimization (TACO)},
  journal-URL = {http://portal.acm.org/browse_dl.cfm?idx=J924},
  volume = {8},
  number = {4},
  articleno = {32},
  pages = {32:1--32:??},
  month = jan,
  year = {2012},
  DOI = {http://dx.doi.org/10.1145/2086696.2086711},
  CODEN = {????},
  ISSN = {1544-3566 (print), 1544-3973 (electronic)},
  ISSN-L = {1544-3566},
  abstract = {Pipelining algorithms are typically concerned with
    improving only the steady-state performance, or the kernel time. The
    pipeline setup time happens only once and therefore can be negligible
    compared to the kernel time. However, for Coarse-Grained
    Reconfigurable Architectures (CGRAs) used as a coprocessor to a main
    processor, pipeline setup can take much longer due to the
    communication delay between the two processors, and can become
    significant if it is repeated in an outer loop of a loop nest. In
    this paper we evaluate the overhead of such non-kernel execution
    times when mapping nested loops for CGRAs, and propose a novel
    architecture-compiler cooperative scheme to reduce the overhead,
    while also minimizing the number of extra configurations required.},
  bibdate = {Sat Jan 21 07:49:49 MST 2012},
  bibsource = {http://portal.acm.org/;
    http://www.math.utah.edu/pub/tex/bib/taco.bib},
  acknowledgement = ack-nhfb,
}

@Article{Purnaprajna:2012:MWI,
  author = {Madhura Purnaprajna and Paolo Ienne},
  title = {Making wide-issue {VLIW} processors viable on {FPGAs}},
  journal = j-TACO,
  fjournal = {ACM Transactions on Architecture and Code Optimization (TACO)},
  journal-URL = {http://portal.acm.org/browse_dl.cfm?idx=J924},
  volume = {8},
  number = {4},
  articleno = {33},
  pages = {33:1--33:??},
  month = jan,
  year = {2012},
  DOI = {http://dx.doi.org/10.1145/2086696.2086712},
  CODEN = {????},
  ISSN = {1544-3566 (print), 1544-3973 (electronic)},
  ISSN-L = {1544-3566},
  abstract = {Soft and highly-customized processors are emerging as a
    common way to efficiently control large amount of computing resources
    available on FPGAs. Yet, some processor architectures of choice for
    DSP and media applications, such as wide-issue VLIW processors,
    remain impractical: the multi-ported register file makes a very
    inefficient use of the resources in the FPGA fabric. This paper
    proposes modifications to existing FPGAs to make soft-VLIW processor
    viable. We introduce an embedded multi-ported RAM that can be
    customized to match the issue-width of VLIW processors. To ascertain
    the benefits of this approach, we map an extensible VLIW processor
    onto a standard FPGA from Xilinx.},
  bibdate = {Sat Jan 21 07:49:49 MST 2012},
  bibsource = {http://portal.acm.org/;
    http://www.math.utah.edu/pub/tex/bib/taco.bib},
  acknowledgement = ack-nhfb,
}

@Article{Radojkovic:2012:EIS,
  author = {Petar Radojkovi{\'c} and Sylvain Girbal and Arnaud Grasset
    and Eduardo Qui{\~n}ones and Sami Yehia and Francisco J. Cazorla},
  title = {On the evaluation of the impact of shared resources in
    multithreaded {COTS} processors in time-critical environments},
  journal = j-TACO,
  fjournal = {ACM Transactions on Architecture and Code Optimization (TACO)},
  journal-URL = {http://portal.acm.org/browse_dl.cfm?idx=J924},
  volume = {8},
  number = {4},
  articleno = {34},
  pages = {34:1--34:??},
  month = jan,
  year = {2012},
  DOI = {http://dx.doi.org/10.1145/2086696.2086713},
  CODEN = {????},
  ISSN = {1544-3566 (print), 1544-3973 (electronic)},
  ISSN-L = {1544-3566},
  abstract = {Commercial Off-The-Shelf (COTS) processors are now commonly
    used in real-time embedded systems. The characteristics of these
    processors fulfill system requirements in terms of time-to-market,
    low cost, and high performance-per-watt ratio. However, multithreaded
    (MT) processors are still not widely used in real-time systems
    because the timing analysis is too complex. In MT processors,
    simultaneously-running tasks share and compete for processor
    resources, so the timing analysis has to estimate the possible impact
    that the inter-task interferences have on the execution time of the
    applications. In this paper, we propose a method that quantifies the
    slowdown that simultaneously-running tasks may experience due to
    collision in shared processor resources.},
  bibdate = {Sat Jan 21 07:49:49 MST 2012},
  bibsource = {http://portal.acm.org/;
    http://www.math.utah.edu/pub/tex/bib/taco.bib},
  acknowledgement = ack-nhfb,
}

@Article{Domnitser:2012:NMC,
  author = {Leonid Domnitser and Aamer Jaleel and Jason Loew and Nael
    Abu-Ghazaleh and Dmitry Ponomarev},
  title = {Non-monopolizable caches: Low-complexity mitigation of cache
    side channel attacks},
  journal = j-TACO,
  fjournal = {ACM Transactions on Architecture and Code Optimization (TACO)},
  journal-URL = {http://portal.acm.org/browse_dl.cfm?idx=J924},
  volume = {8},
  number = {4},
  articleno = {35},
  pages = {35:1--35:??},
  month = jan,
  year = {2012},
  DOI = {http://dx.doi.org/10.1145/2086696.2086714},
  CODEN = {????},
  ISSN = {1544-3566 (print), 1544-3973 (electronic)},
  ISSN-L = {1544-3566},
  abstract = {We propose a flexibly-partitioned cache design that either
    drastically weakens or completely eliminates cache-based side channel
    attacks. The proposed Non-Monopolizable (NoMo) cache dynamically
    reserves cache lines for active threads and prevents other
    co-executing threads from evicting reserved lines. Unreserved lines
    remain available for dynamic sharing among threads. NoMo requires
    only simple modifications to the cache replacement logic, making it
    straightforward to adopt. It requires no software support enabling it
    to automatically protect pre-existing binaries. NoMo results in
    performance degradation of about 1\% on average. We demonstrate that
    NoMo can provide strong security guarantees for the AES and Blowfish
    encryption algorithms.},
  bibdate = {Sat Jan 21 07:49:49 MST 2012},
  bibsource = {http://portal.acm.org/;
    http://www.math.utah.edu/pub/tex/bib/taco.bib},
  acknowledgement = ack-nhfb,
}

@Article{Rico:2012:SLS,
  author =       {Alejandro Rico and Felipe Cabarcas and Carlos
                 Villavieja and Milan Pavlovic and Augusto Vega and Yoav
                 Etsion and Alex Ramirez and Mateo Valero},
  title =        {On the simulation of large-scale architectures using
                 multiple application abstraction levels},
  journal =      j-TACO,
  volume =       {8},
  number =       {4},
  pages =        {36:1--36:??},
  month =        jan,
  year =         {2012},
  coden =        {????},
  doi =          {http://dx.doi.org/10.1145/2086696.2086715},
  issn =         {1544-3566 (print), 1544-3973 (electronic)},
  issn-l =       {1544-3566},
  bibdate =      {Sat Jan 21 07:49:49 MST 2012},
  bibsource =    {http://portal.acm.org/;
                 http://www.math.utah.edu/pub/tex/bib/taco.bib},
  abstract =     {Simulation is a key tool for computer architecture
                 research. In particular, cycle-accurate simulators are
                 extremely important for microarchitecture exploration
                 and detailed design decisions, but they are slow and,
                 so, not suitable for simulating large-scale
                 architectures, nor are they meant for this. Moreover,
                 microarchitecture design decisions are irrelevant, or
                 even misleading, for early processor design stages and
                 high-level explorations. This allows one to raise the
                 abstraction level of the simulated architecture, and
                 also the application abstraction level, as it does not
                 necessarily have to be represented as an instruction
                 stream. In this paper we introduce a definition of
                 different application abstraction levels, and how these
                 are employed in TaskSim, a multi-core architecture
                 simulator, to provide several architecture modeling
                 abstractions, and simulate large-scale architectures
                 with hundreds of cores.},
  acknowledgement = ack-nhfb,
  articleno =    {36},
  fjournal =     {ACM Transactions on Architecture and Code Optimization
                 (TACO)},
  journal-url =  {http://portal.acm.org/browse_dl.cfm?idx=J924},
}

@Article{Saidi:2012:OED,
  author =       {Selma Saidi and Pranav Tendulkar and Thierry Lepley
                 and Oded Maler},
  title =        {Optimizing explicit data transfers for data parallel
                 applications on the {Cell} architecture},
  journal =      j-TACO,
  volume =       {8},
  number =       {4},
  pages =        {37:1--37:??},
  month =        jan,
  year =         {2012},
  coden =        {????},
  doi =          {http://dx.doi.org/10.1145/2086696.2086716},
  issn =         {1544-3566 (print), 1544-3973 (electronic)},
  issn-l =       {1544-3566},
  bibdate =      {Sat Jan 21 07:49:49 MST 2012},
  bibsource =    {http://portal.acm.org/;
                 http://www.math.utah.edu/pub/tex/bib/taco.bib},
  abstract =     {In this paper we investigate a general approach to
                 automate some deployment decisions for a certain class
                 of applications on multi-core computers. We consider
                 data-parallelizable programs that use the well-known
                 double buffering technique to bring the data from the
                 off-chip slow memory to the local memory of the cores
                 via a DMA (direct memory access) mechanism. Based on
                 the computation time and size of elementary data items
                 as well as DMA characteristics, we derive optimal and
                 near optimal values for the number of blocks that
                 should be clustered in a single DMA command. We then
                 extend the results to the case where a computation for
                 one data item needs some data in its neighborhood.},
  acknowledgement = ack-nhfb,
  articleno =    {37},
  fjournal =     {ACM Transactions on Architecture and Code Optimization
                 (TACO)},
  journal-url =  {http://portal.acm.org/browse_dl.cfm?idx=J924},
}

@Article{Feng:2012:PPL,
  author =       {Min Feng and Changhui Lin and Rajiv Gupta},
  title =        {{PLDS}: Partitioning linked data structures for
                 parallelism},
  journal =      j-TACO,
  volume =       {8},
  number =       {4},
  pages =        {38:1--38:??},
  month =        jan,
  year =         {2012},
  coden =        {????},
  doi =          {http://dx.doi.org/10.1145/2086696.2086717},
  issn =         {1544-3566 (print), 1544-3973 (electronic)},
  issn-l =       {1544-3566},
  bibdate =      {Sat Jan 21 07:49:49 MST 2012},
  bibsource =    {http://portal.acm.org/;
                 http://www.math.utah.edu/pub/tex/bib/taco.bib},
  abstract =     {Recently, parallelization of computations in the
                 presence of dynamic data structures has shown promising
                 potential. In this paper, we present PLDS, a system for
                 easily expressing and efficiently exploiting
                 parallelism in computations that are based on dynamic
                 linked data structures. PLDS improves the execution
                 efficiency by providing support for data partitioning
                 and then distributing computation across threads based
                 on the partitioning. Such computations often require
                 the use of speculation to exploit dynamic parallelism.
                 PLDS supports a conditional speculation mechanism that
                 reduces the cost of speculation. PLDS can be employed
                 in the context of different forms of parallelism, which
                 to cover a wide range of parallel applications.},
  acknowledgement = ack-nhfb,
  articleno =    {38},
  fjournal =     {ACM Transactions on Architecture and Code Optimization
                 (TACO)},
  journal-url =  {http://portal.acm.org/browse_dl.cfm?idx=J924},
}

@Article{Pradelle:2012:PPB,
  author =       {Benoit Pradelle and Alain Ketterlin and Philippe
                 Clauss},
  title =        {Polyhedral parallelization of binary code},
  journal =      j-TACO,
  volume =       {8},
  number =       {4},
  pages =        {39:1--39:??},
  month =        jan,
  year =         {2012},
  coden =        {????},
  doi =          {http://dx.doi.org/10.1145/2086696.2086718},
  issn =         {1544-3566 (print), 1544-3973 (electronic)},
  issn-l =       {1544-3566},
  bibdate =      {Sat Jan 21 07:49:49 MST 2012},
  bibsource =    {http://portal.acm.org/;
                 http://www.math.utah.edu/pub/tex/bib/taco.bib},
  abstract =     {Many automatic software parallelization systems have
                 been proposed in the past decades, but most of them are
                 dedicated to source-to-source transformations. This
                 paper shows that parallelizing executable programs is
                 feasible, even if they require complex transformations,
                 and in effect decouples parallelization from
                 compilation, for example, for closed-source or legacy
                 software, where binary code is the only available
                 representation. We propose an automatic parallelizer,
                 which is able to perform advanced parallelization on
                 binary code. It first parses the binary code and
                 extracts high-level information. From this information,
                 a C program is generated. This program captures only a
                 subset of the program semantics, namely, loops and
                 memory accesses.},
  acknowledgement = ack-nhfb,
  articleno =    {39},
  fjournal =     {ACM Transactions on Architecture and Code Optimization
                 (TACO)},
  journal-url =  {http://portal.acm.org/browse_dl.cfm?idx=J924},
}

@Article{Dong:2012:RAE,
  author =       "Yaozu Dong and Yu Chen and Zhenhao Pan and Jinquan Dai
                 and Yunhong Jiang",
  title =        "{ReNIC}: Architectural extension to {SR-IOV} {I/O}
                 virtualization for efficient replication",
  journal =      j-TACO,
  volume =       "8",
  number =       "4",
  pages =        "40:1--40:??",
  month =        jan,
  year =         "2012",
  CODEN =        "????",
  DOI =          "http://dx.doi.org/10.1145/2086696.2086719",
  ISSN =         "1544-3566 (print), 1544-3973 (electronic)",
  ISSN-L =       "1544-3566",
  bibdate =      "Sat Jan 21 07:49:49 MST 2012",
  bibsource =    "http://portal.acm.org/;
                 http://www.math.utah.edu/pub/tex/bib/taco.bib",
  abstract =     "Virtualization is gaining popularity in cloud
                 computing and has become the key enabling technology in
                 cloud infrastructure. By replicating the virtual server
                 state to multiple independent platforms, virtualization
                 improves the reliability and availability of cloud
                 systems. Unfortunately, existing Virtual Machine (VM)
                 replication solutions were designed only for software
                 virtualized I/O, which suffers from large performance
                 and scalability overheads. Although hardware-assisted
                 I/O virtualization (such as SR-IOV) can achieve close
                 to native performance and very good scalability, they
                 cannot be properly replicated across different physical
                 machines due to architectural limitations (such as lack
                 of efficient device state read/write, buffering
                 outbound packets, etc.).",
  acknowledgement = ack-nhfb,
  articleno =    "40",
  fjournal =     "ACM Transactions on Architecture and Code Optimization
                 (TACO)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J924",
}

@Article{Bruintjes:2012:SLA,
  author =       {Tom M. Bruintjes and Karel H. G. Walters and Sabih H.
                 Gerez and Bert Molenkamp and Gerard J. M. Smit},
  title =        {{Sabrewing}: a lightweight architecture for combined
                 floating-point and integer arithmetic},
  journal =      j-TACO,
  volume =       {8},
  number =       {4},
  pages =        {41:1--41:??},
  month =        jan,
  year =         {2012},
  coden =        {????},
  doi =          {http://dx.doi.org/10.1145/2086696.2086720},
  issn =         {1544-3566 (print), 1544-3973 (electronic)},
  issn-l =       {1544-3566},
  bibdate =      {Sat Jan 21 07:49:49 MST 2012},
  bibsource =    {http://portal.acm.org/;
                 http://www.math.utah.edu/pub/tex/bib/taco.bib},
  abstract =     {In spite of the fact that floating-point arithmetic is
                 costly in terms of silicon area, the joint design of
                 hardware for floating-point and integer arithmetic is
                 seldom considered. While components like multipliers
                 and adders can potentially be shared, floating-point
                 and integer units in contemporary processors are
                 practically disjoint. This work presents a new
                 architecture which tightly integrates floating-point
                 and integer arithmetic in a single datapath. It is
                 mainly intended for use in low-power embedded digital
                 signal processors and therefore the following design
                 constraints were important: limited use of pipelining
                 for the convenience of the compiler; maintaining
                 compatibility with existing technology; minimal area
                 and power consumption for applicability in embedded
                 systems.},
  acknowledgement = ack-nhfb,
  articleno =    {41},
  fjournal =     {ACM Transactions on Architecture and Code Optimization
                 (TACO)},
  journal-url =  {http://portal.acm.org/browse_dl.cfm?idx=J924},
}

@Article{Kicherer:2012:SPA,
  author =       {Mario Kicherer and Fabian Nowak and Rainer Buchty and
                 Wolfgang Karl},
  title =        {Seamlessly portable applications: Managing the
                 diversity of modern heterogeneous systems},
  journal =      j-TACO,
  volume =       {8},
  number =       {4},
  pages =        {42:1--42:??},
  month =        jan,
  year =         {2012},
  coden =        {????},
  doi =          {http://dx.doi.org/10.1145/2086696.2086721},
  issn =         {1544-3566 (print), 1544-3973 (electronic)},
  issn-l =       {1544-3566},
  bibdate =      {Sat Jan 21 07:49:49 MST 2012},
  bibsource =    {http://portal.acm.org/;
                 http://www.math.utah.edu/pub/tex/bib/taco.bib},
  abstract =     {Nowadays, many possible configurations of
                 heterogeneous systems exist, posing several new
                 challenges to application development: different types
                 of processing units usually require individual
                 programming models with dedicated runtime systems and
                 accompanying libraries. If these are absent on an
                 end-user system, e.g. because the respective hardware
                 is not present, an application linked against these
                 will break. This handicaps portability of applications
                 being developed on one system and executed on other,
                 differently configured heterogeneous systems. Moreover,
                 the individual profit of different processing units is
                 normally not known in advance. In this work, we propose
                 a technique to effectively decouple applications from
                 their accelerator-specific parts, respectively code.},
  acknowledgement = ack-nhfb,
  articleno =    {42},
  fjournal =     {ACM Transactions on Architecture and Code Optimization
                 (TACO)},
  journal-url =  {http://portal.acm.org/browse_dl.cfm?idx=J924},
}

@Article{Premillieu:2012:SSR,
  author =       {Nathanael Premillieu and Andre Seznec},
  title =        {{SYRANT}: {SYmmetric Resource Allocation on Not-taken
                 and Taken} paths},
  journal =      j-TACO,
  volume =       {8},
  number =       {4},
  pages =        {43:1--43:??},
  month =        jan,
  year =         {2012},
  coden =        {????},
  doi =          {http://dx.doi.org/10.1145/2086696.2086722},
  issn =         {1544-3566 (print), 1544-3973 (electronic)},
  issn-l =       {1544-3566},
  bibdate =      {Sat Jan 21 07:49:49 MST 2012},
  bibsource =    {http://portal.acm.org/;
                 http://www.math.utah.edu/pub/tex/bib/taco.bib},
  abstract =     {In the multicore era, achieving ultimate single
                 process performance is still an issue e.g. for single
                 process workload or for sequential sections in parallel
                 applications. Unfortunately, despite tremendous
                 research effort on branch prediction, substantial
                 performance potential is still wasted due to branch
                 mispredictions. On a branch misprediction resolution,
                 instruction treatment on the wrong path is essentially
                 thrown away. However, in most cases after a conditional
                 branch, the taken and the not-taken paths of execution
                 merge after a few instructions. Instructions that
                 follow the reconvergence point are executed whatever
                 the branch outcome is. We present SYRANT (SYmmetric
                 Resource Allocation on Not-taken and Taken paths), a
                 new technique for exploiting control independence.},
  acknowledgement = ack-nhfb,
  articleno =    {43},
  fjournal =     {ACM Transactions on Architecture and Code Optimization
                 (TACO)},
  journal-url =  {http://portal.acm.org/browse_dl.cfm?idx=J924},
}

%%% The surname Steely Jr. is braced in the author field below so that
%%% BibTeX treats it as one family name instead of parsing Jr. as the
%%% last name and Simon Steely as the given names.
@Article{Hasenplaugh:2012:GBC,
  author =       "William Hasenplaugh and Pritpal S. Ahuja and Aamer
                 Jaleel and Simon {Steely Jr.} and Joel Emer",
  title =        "The gradient-based cache partitioning algorithm",
  journal =      j-TACO,
  volume =       "8",
  number =       "4",
  pages =        "44:1--44:??",
  month =        jan,
  year =         "2012",
  CODEN =        "????",
  DOI =          "http://dx.doi.org/10.1145/2086696.2086723",
  ISSN =         "1544-3566 (print), 1544-3973 (electronic)",
  ISSN-L =       "1544-3566",
  bibdate =      "Sat Jan 21 07:49:49 MST 2012",
  bibsource =    "http://portal.acm.org/;
                 http://www.math.utah.edu/pub/tex/bib/taco.bib",
  abstract =     "This paper addresses the problem of partitioning a
                 cache between multiple concurrent threads and in the
                 presence of hardware prefetching. Cache replacement
                 designed to preserve temporal locality (e.g., LRU) will
                 allocate cache resources proportional to the miss-rate
                 of each competing thread irrespective of whether the
                 cache space will be utilized [Qureshi and Patt 2006].
                 This is clearly suboptimal as applications vary
                 dramatically in their use of recently accessed data. We
                 address this problem by partitioning a shared cache
                 such that a global goodness metric is optimized. This
                 paper introduces the Gradient-based Cache Partitioning
                 Algorithm (GPA), whose variants optimize either
                 hitrate, total instructions per cycle (IPC) or a
                 weighted IPC metric designed to enforce Quality of
                 Service (QoS) [Iyer 2004].",
  acknowledgement = ack-nhfb,
  articleno =    "44",
  fjournal =     "ACM Transactions on Architecture and Code Optimization
                 (TACO)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J924",
}

@Article{Lira:2012:MPA,
  author =       {Javier Lira and Timothy M. Jones and Carlos Molina and
                 Antonio Gonz{\'a}lez},
  title =        {The migration prefetcher: Anticipating data promotion
                 in dynamic {NUCA} caches},
  journal =      j-TACO,
  volume =       {8},
  number =       {4},
  pages =        {45:1--45:??},
  month =        jan,
  year =         {2012},
  coden =        {????},
  doi =          {http://dx.doi.org/10.1145/2086696.2086724},
  issn =         {1544-3566 (print), 1544-3973 (electronic)},
  issn-l =       {1544-3566},
  bibdate =      {Sat Jan 21 07:49:49 MST 2012},
  bibsource =    {http://portal.acm.org/;
                 http://www.math.utah.edu/pub/tex/bib/taco.bib},
  abstract =     {The exponential increase in multicore processor (CMP)
                 cache sizes accompanied by growing on-chip wire delays
                 make it difficult to implement traditional caches with
                 a single, uniform access latency. Non-Uniform Cache
                 Architecture (NUCA) designs have been proposed to
                 address this problem. A NUCA divides the whole cache
                 memory into smaller banks and allows banks nearer a
                 processor core to have lower access latencies than
                 those further away, thus mitigating the effects of the
                 cache's internal wires. Determining the best placement
                 for data in the NUCA cache at any particular moment
                 during program execution is crucial for exploiting the
                 benefits that this architecture provides.},
  acknowledgement = ack-nhfb,
  articleno =    {45},
  fjournal =     {ACM Transactions on Architecture and Code Optimization
                 (TACO)},
  journal-url =  {http://portal.acm.org/browse_dl.cfm?idx=J924},
}

@Article{Pusukuri:2012:TTD,
  author =       "Kishore Kumar Pusukuri and Rajiv Gupta and Laxmi N.
                 Bhuyan",
  title =        "{Thread Tranquilizer}: Dynamically reducing
                 performance variation",
  journal =      j-TACO,
  volume =       "8",
  number =       "4",
  pages =        "46:1--46:??",
  month =        jan,
  year =         "2012",
  CODEN =        "????",
  DOI =          "http://dx.doi.org/10.1145/2086696.2086725",
  ISSN =         "1544-3566 (print), 1544-3973 (electronic)",
  ISSN-L =       "1544-3566",
  bibdate =      "Sat Jan 21 07:49:49 MST 2012",
  bibsource =    "http://portal.acm.org/;
                 http://www.math.utah.edu/pub/tex/bib/taco.bib",
  abstract =     "To realize the performance potential of multicore
                 systems, we must effectively manage the interactions
                 between memory reference behavior and the operating
                 system policies for thread scheduling and migration
                 decisions. We observe that these interactions lead to
                 significant variations in the performance of a given
                 application, from one execution to the next, even when
                 the program input remains unchanged and no other
                 applications are being run on the system. Our
                 experiments with multithreaded programs, including the
                 TATP database application, SPECjbb2005, and a subset of
                 PARSEC and SPEC OMP programs, on a 24-core Dell
                 PowerEdge R905 server running OpenSolaris confirms the
                 above observation.",
  acknowledgement = ack-nhfb,
  articleno =    "46",
  fjournal =     "ACM Transactions on Architecture and Code Optimization
                 (TACO)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J924",
}

@Article{Zhang:2012:TPB,
  author =       {Dongsong Zhang and Deke Guo and Fangyuan Chen and Fei
                 Wu and Tong Wu and Ting Cao and Shiyao Jin},
  title =        {{TL}-plane-based multi-core energy-efficient real-time
                 scheduling algorithm for sporadic tasks},
  journal =      j-TACO,
  volume =       {8},
  number =       {4},
  pages =        {47:1--47:??},
  month =        jan,
  year =         {2012},
  coden =        {????},
  doi =          {http://dx.doi.org/10.1145/2086696.2086726},
  issn =         {1544-3566 (print), 1544-3973 (electronic)},
  issn-l =       {1544-3566},
  bibdate =      {Sat Jan 21 07:49:49 MST 2012},
  bibsource =    {http://portal.acm.org/;
                 http://www.math.utah.edu/pub/tex/bib/taco.bib},
  abstract =     {As the energy consumption of multi-core systems
                 becomes increasingly prominent, it's a challenge to
                 design an energy-efficient real-time scheduling
                 algorithm in multi-core systems for reducing the system
                 energy consumption while guaranteeing the feasibility
                 of real-time tasks. In this paper, we focus on
                 multi-core processors, with the global Dynamic Voltage
                 Frequency Scaling (DVFS) and Dynamic Power Management
                 (DPM) technologies. In this setting, we propose an
                 energy-efficient real-time scheduling algorithm, the
                 Time Local remaining execution plane based Dynamic
                 Voltage Frequency Scaling (TL-DVFS). TL-DVFS utilizes
                 the concept of Time Local remaining execution (TL)
                 plane to dynamically scale the voltage and frequency of
                 a processor at the initial time of each TL plane as
                 well as at the release time of a sporadic task in each
                 TL plane.},
  acknowledgement = ack-nhfb,
  articleno =    {47},
  fjournal =     {ACM Transactions on Architecture and Code Optimization
                 (TACO)},
  journal-url =  {http://portal.acm.org/browse_dl.cfm?idx=J924},
}

@Article{Lyons:2012:ASS,
  author =       {Michael J. Lyons and Mark Hempstead and Gu-Yeon Wei
                 and David Brooks},
  title =        {The accelerator store: a shared memory framework for
                 accelerator-based systems},
  journal =      j-TACO,
  volume =       {8},
  number =       {4},
  pages =        {48:1--48:??},
  month =        jan,
  year =         {2012},
  coden =        {????},
  doi =          {http://dx.doi.org/10.1145/2086696.2086727},
  issn =         {1544-3566 (print), 1544-3973 (electronic)},
  issn-l =       {1544-3566},
  bibdate =      {Sat Jan 21 07:49:49 MST 2012},
  bibsource =    {http://portal.acm.org/;
                 http://www.math.utah.edu/pub/tex/bib/taco.bib},
  abstract =     {This paper presents the many-accelerator architecture,
                 a design approach combining the scalability of
                 homogeneous multi-core architectures and
                 system-on-chip's high performance and power-efficient
                 hardware accelerators. In preparation for systems
                 containing tens or hundreds of accelerators, we
                 characterize a diverse pool of accelerators and find
                 each contains significant amounts of SRAM memory (up to
                 90\% of their area). We take advantage of this
                 discovery and introduce the accelerator store, a
                 scalable architectural component to minimize
                 accelerator area by sharing its memories between
                 accelerators. We evaluate the accelerator store for two
                 applications and find significant system area
                 reductions (30\%) in exchange for small overheads (2\%
                 performance, 0\%--8\% energy).},
  acknowledgement = ack-nhfb,
  articleno =    {48},
  fjournal =     {ACM Transactions on Architecture and Code Optimization
                 (TACO)},
  journal-url =  {http://portal.acm.org/browse_dl.cfm?idx=J924},
}

@Article{Orozco:2012:THT,
  author =       {Daniel Orozco and Elkin Garcia and Rishi Khan and
                 Kelly Livingston and Guang R. Gao},
  title =        {Toward high-throughput algorithms on many-core
                 architectures},
  journal =      j-TACO,
  volume =       {8},
  number =       {4},
  pages =        {49:1--49:??},
  month =        jan,
  year =         {2012},
  coden =        {????},
  doi =          {http://dx.doi.org/10.1145/2086696.2086728},
  issn =         {1544-3566 (print), 1544-3973 (electronic)},
  issn-l =       {1544-3566},
  bibdate =      {Sat Jan 21 07:49:49 MST 2012},
  bibsource =    {http://portal.acm.org/;
                 http://www.math.utah.edu/pub/tex/bib/taco.bib},
  abstract =     {Advanced many-core CPU chips already have a few
                 hundreds of processing cores (e.g., 160 cores in an IBM
                 Cyclops-64 chip) and more and more processing cores
                 become available as computer architecture progresses.
                 The underlying runtime systems of such architectures
                 need to efficiently serve hundreds of processors at the
                 same time, requiring all basic data structures within
                 the runtime to maintain unprecedented throughput. In
                 this paper, we analyze the throughput requirements that
                 must be met by algorithms in runtime systems to be able
                 to handle hundreds of simultaneous operations in real
                 time. We reach a surprising conclusion: Many
                 traditional algorithm techniques are poorly suited for
                 highly parallel computing environments because of their
                 low throughput.},
  acknowledgement = ack-nhfb,
  articleno =    {49},
  fjournal =     {ACM Transactions on Architecture and Code Optimization
                 (TACO)},
  journal-url =  {http://portal.acm.org/browse_dl.cfm?idx=J924},
}

@Article{Stock:2012:UML,
  author =       "Kevin Stock and Louis-No{\"e}l Pouchet and P.
                 Sadayappan",
  title =        "Using machine learning to improve automatic
                 vectorization",
  journal =      j-TACO,
  volume =       "8",
  number =       "4",
  pages =        "50:1--50:??",
  month =        jan,
  year =         "2012",
  CODEN =        "????",
  DOI =          "http://dx.doi.org/10.1145/2086696.2086729",
  ISSN =         "1544-3566 (print), 1544-3973 (electronic)",
  ISSN-L =       "1544-3566",
  bibdate =      "Sat Jan 21 07:49:49 MST 2012",
  bibsource =    "http://portal.acm.org/;
                 http://www.math.utah.edu/pub/tex/bib/taco.bib",
  abstract =     "Automatic vectorization is critical to enhancing
                 performance of compute-intensive programs on modern
                 processors. However, there is much room for improvement
                 over the auto-vectorization capabilities of current
                 production compilers through careful vector-code
                 synthesis that utilizes a variety of loop
                 transformations (e.g., unroll-and-jam, interchange,
                 etc.). As the set of transformations considered is
                 increased, the selection of the most effective
                 combination of transformations becomes a significant
                 challenge: Currently used cost models in vectorizing
                 compilers are often unable to identify the best
                 choices. In this paper, we address this problem using
                 machine learning models to predict the performance of
                 SIMD codes. In contrast to existing approaches that
                 have used high-level features of the program, we
                 develop machine learning models based on features
                 extracted from the generated assembly code.",
  acknowledgement = ack-nhfb,
  articleno =    "50",
  fjournal =     "ACM Transactions on Architecture and Code Optimization
                 (TACO)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J924",
}

@Article{Therdsteerasukdi:2012:URI,
  author =       {Kanit Therdsteerasukdi and Gyungsu Byun and Jason Cong
                 and M. Frank Chang and Glenn Reinman},
  title =        {Utilizing {RF-I} and intelligent scheduling for better
                 throughput\slash watt in a mobile {GPU} memory system},
  journal =      j-TACO,
  volume =       {8},
  number =       {4},
  pages =        {51:1--51:??},
  month =        jan,
  year =         {2012},
  coden =        {????},
  doi =          {http://dx.doi.org/10.1145/2086696.2086730},
  issn =         {1544-3566 (print), 1544-3973 (electronic)},
  issn-l =       {1544-3566},
  bibdate =      {Sat Jan 21 07:49:49 MST 2012},
  bibsource =    {http://portal.acm.org/;
                 http://www.math.utah.edu/pub/tex/bib/taco.bib},
  abstract =     {Smartphones and tablets are becoming more and more
                 powerful, replacing desktops and laptops as the users'
                 main computing system. As these systems support higher
                 and higher resolutions with more complex 3D graphics, a
                 high-throughput and low-power memory system is
                 essential for the mobile GPU. In this article, we
                 propose to improve throughput/watt in a mobile GPU
                 memory system by using intelligent scheduling to reduce
                 power and multi-band radio frequency interconnect
                 (MRF-I) to offset any throughput degradation caused by
                 our intelligent scheduling.},
  acknowledgement = ack-nhfb,
  articleno =    {51},
  fjournal =     {ACM Transactions on Architecture and Code Optimization
                 (TACO)},
  journal-url =  {http://portal.acm.org/browse_dl.cfm?idx=J924},
}

@Article{Ryckbosch:2012:VSM,
  author =       "Frederick Ryckbosch and Stijn Polfliet and Lieven
                 Eeckhout",
  title =        "{VSim}: Simulating multi-server setups at near native
                 hardware speed",
  journal =      j-TACO,
  volume =       "8",
  number =       "4",
  pages =        "52:1--52:??",
  month =        jan,
  year =         "2012",
  CODEN =        "????",
  DOI =          "http://dx.doi.org/10.1145/2086696.2086731",
  ISSN =         "1544-3566 (print), 1544-3973 (electronic)",
  ISSN-L =       "1544-3566",
  bibdate =      "Sat Jan 21 07:49:49 MST 2012",
  bibsource =    "http://portal.acm.org/;
                 http://www.math.utah.edu/pub/tex/bib/taco.bib",
  abstract =     "Simulating contemporary computer systems is a
                 challenging endeavor, especially when it comes to
                 simulating high-end setups involving multiple servers.
                 The simulation environment needs to run complete
                 software stacks, including operating systems,
                 middleware, and application software, and it needs to
                 simulate network and disk activity next to CPU
                 performance. In addition, it needs the ability to scale
                 out to a large number of server nodes while attaining
                 good accuracy and reasonable simulation speeds. This
                 paper presents VSim, a novel simulation methodology for
                 multi-server systems. VSim leverages virtualization
                 technology for simulating a target system on a host
                 system. VSim controls CPU, network and disk performance
                 on the host, and it gives the illusion to the software
                 stack to run on a target system through time
                 dilation.",
  acknowledgement = ack-nhfb,
  articleno =    "52",
  fjournal =     "ACM Transactions on Architecture and Code Optimization
                 (TACO)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J924",
}

@Article{Zhou:2012:WAP,
  author =       "Miao Zhou and Yu Du and Bruce Childers and Rami Melhem
                 and Daniel Moss{\'e}",
  title =        "Writeback-aware partitioning and replacement for
                 last-level caches in phase change main memory systems",
  journal =      j-TACO,
  volume =       "8",
  number =       "4",
  pages =        "53:1--53:??",
  month =        jan,
  year =         "2012",
  CODEN =        "????",
  DOI =          "http://dx.doi.org/10.1145/2086696.2086732",
  ISSN =         "1544-3566 (print), 1544-3973 (electronic)",
  ISSN-L =       "1544-3566",
  bibdate =      "Sat Jan 21 07:49:49 MST 2012",
  bibsource =    "http://portal.acm.org/;
                 http://www.math.utah.edu/pub/tex/bib/taco.bib",
  abstract =     "Phase-Change Memory (PCM) has emerged as a promising
                 low-power main memory candidate to replace DRAM. The
                 main problems of PCM are that writes are much slower
                 and more power hungry than reads, write bandwidth is
                 much lower than read bandwidth, and limited write
                 endurance. Adding an extra layer of cache, which is
                 logically the last-level cache (LLC), can mitigate the
                 drawbacks of PCM. However, writebacks from the LLC
                 might (a) overwhelm the limited PCM write bandwidth and
                 stall the application, (b) shorten lifetime, and (c)
                 increase energy consumption. Cache partitioning and
                 replacement schemes are important to achieve high
                 throughput for multi-core systems.",
  acknowledgement = ack-nhfb,
  articleno =    "53",
  fjournal =     "ACM Transactions on Architecture and Code Optimization
                 (TACO)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J924",
}

@Article{Wang:2012:TMA,
  author =       "Qingping Wang and Sameer Kulkarni and John Cavazos and
                 Michael Spear",
  title =        "A transactional memory with automatic performance
                 tuning",
  journal =      j-TACO,
  volume =       "8",
  number =       "4",
  pages =        "54:1--54:??",
  month =        jan,
  year =         "2012",
  CODEN =        "????",
  DOI =          "http://dx.doi.org/10.1145/2086696.2086733",
  ISSN =         "1544-3566 (print), 1544-3973 (electronic)",
  ISSN-L =       "1544-3566",
  bibdate =      "Sat Jan 21 07:49:49 MST 2012",
  bibsource =    "http://portal.acm.org/;
                 http://www.math.utah.edu/pub/tex/bib/taco.bib",
  abstract =     "A significant obstacle to the acceptance of
                 transactional memory (TM) in real-world parallel
                 programs is the abundance of substantially different TM
                 algorithms. Each TM algorithm appears well-suited to
                 certain workload characteristics, but the best choice
                 of algorithm is sensitive to program inputs, available
                 cores, and program phases. Furthermore, operating
                 system and hardware characteristics can affect which
                 algorithm is best, with tradeoffs changing across
                 iterations of a single ISA. This paper introduces
                 methods for constructing policies to dynamically select
                 the most appropriate TM algorithm based on static and
                 dynamic information. We leverage intraprocedural static
                 analysis to create a static profile of the
                 application.",
  acknowledgement = ack-nhfb,
  articleno =    "54",
  fjournal =     "ACM Transactions on Architecture and Code Optimization
                 (TACO)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J924",
}

@Article{Bogdanski:2012:SFC,
  author =       "Bartosz Bogdanski and Sven-Arne Reinemo and Frank Olaf
                 Sem-Jacobsen and Ernst Gunnar Gran",
  title =        "{sFtree}: a fully connected and deadlock-free
                 switch-to-switch routing algorithm for fat-trees",
  journal =      j-TACO,
  volume =       "8",
  number =       "4",
  pages =        "55:1--55:??",
  month =        jan,
  year =         "2012",
  CODEN =        "????",
  DOI =          "http://dx.doi.org/10.1145/2086696.2086734",
  ISSN =         "1544-3566 (print), 1544-3973 (electronic)",
  ISSN-L =       "1544-3566",
  bibdate =      "Sat Jan 21 07:49:49 MST 2012",
  bibsource =    "http://portal.acm.org/;
                 http://www.math.utah.edu/pub/tex/bib/taco.bib",
  abstract =     "Existing fat-tree routing algorithms fully exploit the
                 path diversity of a fat-tree topology in the context of
                 compute node traffic, but they lack support for
                 deadlock-free and fully connected switch-to-switch
                 communication. Such support is crucial for efficient
                 system management, for example, in InfiniBand (IB)
                 systems. With the general increase in system management
                 capabilities found in modern InfiniBand switches, the
                 lack of deadlock-free switch-to-switch communication is
                 a problem for fat-tree-based IB installations because
                 management traffic might cause routing deadlocks that
                 bring the whole system down. This lack of deadlock-free
                 communication affects all system management and
                 diagnostic tools using LID routing. In this paper, we
                 propose the sFtree routing algorithm that guarantees
                 deadlock-free and fully connected switch-to-switch
                 communication in fat-trees while maintaining the
                 properties of the current fat-tree algorithm.",
  acknowledgement = ack-nhfb,
  articleno =    "55",
  fjournal =     "ACM Transactions on Architecture and Code Optimization
                 (TACO)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J924",
}

@Article{Ghandour:2012:LSB,
  author =       "Walid J. Ghandour and Haitham Akkary and Wes Masri",
  title =        "Leveraging Strength-Based Dynamic Information Flow
                 Analysis to Enhance Data Value Prediction",
  journal =      j-TACO,
  volume =       "9",
  number =       "1",
  pages =        "1:1--1:??",
  month =        mar,
  year =         "2012",
  CODEN =        "????",
  DOI =          "http://dx.doi.org/10.1145/2133382.2133383",
  ISSN =         "1544-3566 (print), 1544-3973 (electronic)",
  ISSN-L =       "1544-3566",
  bibdate =      "Fri Mar 30 17:45:35 MDT 2012",
  bibsource =    "http://portal.acm.org/;
                 http://www.math.utah.edu/pub/tex/bib/taco.bib",
  abstract =     "Value prediction is a technique to increase
                 parallelism by attempting to overcome serialization
                 constraints caused by true data dependences. By
                 predicting the outcome of an instruction before it
                 executes, value prediction allows data dependent
                 instructions to issue and execute speculatively, hence
                 increasing parallelism when the prediction is correct.
                 In case of a misprediction, the execution is redone
                 with the corrected value. If the benefit from increased
                 parallelism outweighs the misprediction recovery
                 penalty, overall performance could be improved.
                 Enhancing performance with value prediction therefore
                 requires highly accurate prediction methods. Most
                 existing general value prediction techniques are local,
                 that is, future outputs of an instruction are predicted
                 based on outputs from previous executions of the same
                 instruction. In this article, we investigate leveraging
                 strength-based dynamic information flow analysis to
                 enhance data value prediction. We use dynamic
                 information flow analysis (DIFA) to determine when a
                 specific value predictor can perform well and even
                 outperform other predictors. We apply information
                 theory to mathematically prove the validity and
                 benefits of correlating value predictors. We also
                 introduce the concept of the linear value predictors, a
                 new technique that predicts a new value from another
                 one using a linear relation. We finally present a
                 variant of stride predictor that we call update stride.
                 We then conduct an empirical analysis using Pin, a
                 dynamic binary instrumentation tool, and DynFlow, a
                 dynamic information flow analysis tool, that we apply
                 to programs from the SPECjvm2008 and Siemens
                 benchmarks. Our empirical measurements support our
                 mathematical theory and allow us to make important
                 observations on the relation between predictability of
                 data values and information flow. Our analysis and
                 empirical results show that the values of a set of
                 selected variables can be predicted with a very high
                 accuracy, up to 100\%. Such prediction is based on the
                 previous history and/or the values of one or more other
                 source variables that have strong information flow into
                 the predicted variable. Using our selection criteria,
                 we show that a DIFA-directed predictor outperforms
                 hardware value prediction for all subject programs, and
                 sometimes by a significant margin. This was observed
                 even when using an ideal tagged hardware value
                 prediction table that does not suffer from aliasing or
                 capacity misses.",
  acknowledgement = ack-nhfb,
  articleno =    "1",
  fjournal =     "ACM Transactions on Architecture and Code Optimization
                 (TACO)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J924",
}

@Article{Lee:2012:WPW,
  author =       "Jaekyu Lee and Hyesoon Kim and Richard Vuduc",
  title =        "When Prefetching Works, When It Doesn't, and Why",
  journal =      j-TACO,
  volume =       "9",
  number =       "1",
  pages =        "2:1--2:??",
  month =        mar,
  year =         "2012",
  CODEN =        "????",
  DOI =          "http://dx.doi.org/10.1145/2133382.2133384",
  ISSN =         "1544-3566 (print), 1544-3973 (electronic)",
  ISSN-L =       "1544-3566",
  bibdate =      "Fri Mar 30 17:45:35 MDT 2012",
  bibsource =    "http://portal.acm.org/;
                 http://www.math.utah.edu/pub/tex/bib/taco.bib",
  abstract =     "In emerging and future high-end processor systems,
                 tolerating increasing cache miss latency and properly
                 managing memory bandwidth will be critical to achieving
                 high performance. Prefetching, in both hardware and
                 software, is among our most important available
                 techniques for doing so; yet, we claim that prefetching
                 is perhaps also the least well-understood. Thus, the
                 goal of this study is to develop a novel, foundational
                 understanding of both the benefits and limitations of
                 hardware and software prefetching. Our study includes:
                 source code-level analysis, to help in understanding
                 the practical strengths and weaknesses of compiler- and
                 software-based prefetching; a study of the synergistic
                 and antagonistic effects between software and hardware
                 prefetching; and an evaluation of hardware prefetching
                 training policies in the presence of software
                 prefetching requests. We use both simulation and
                 measurement on real systems. We find, for instance,
                 that although there are many opportunities for
                 compilers to prefetch much more aggressively than they
                 currently do, there is also a tangible risk of
                 interference with training existing hardware
                 prefetching mechanisms. Taken together, our
                 observations suggest new research directions for
                 cooperative hardware/software prefetching.",
  acknowledgement = ack-nhfb,
  articleno =    "2",
  fjournal =     "ACM Transactions on Architecture and Code Optimization
                 (TACO)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J924",
}

@Article{Mazloom:2012:DTI,
  author =       "Bita Mazloom and Shashidhar Mysore and Mohit Tiwari
                 and Banit Agrawal and Tim Sherwood",
  title =        "Dataflow Tomography: Information Flow Tracking For
                 Understanding and Visualizing Full Systems",
  journal =      j-TACO,
  volume =       "9",
  number =       "1",
  pages =        "3:1--3:??",
  month =        mar,
  year =         "2012",
  CODEN =        "????",
  DOI =          "http://dx.doi.org/10.1145/2133382.2133385",
  ISSN =         "1544-3566 (print), 1544-3973 (electronic)",
  ISSN-L =       "1544-3566",
  bibdate =      "Fri Mar 30 17:45:35 MDT 2012",
  bibsource =    "http://portal.acm.org/;
                 http://www.math.utah.edu/pub/tex/bib/taco.bib",
  abstract =     "It is not uncommon for modern systems to be composed
                 of a variety of interacting services, running across
                 multiple machines in such a way that most developers do
                 not really understand the whole system. As abstraction
                 is layered atop abstraction, developers gain the
                 ability to compose systems of extraordinary complexity
                 with relative ease. However, many software properties,
                 especially those that cut across abstraction layers,
                 become very difficult to understand in such
                 compositions. The communication patterns involved, the
                 privacy of critical data, and the provenance of
                 information, can be difficult to find and understand,
                 even with access to all of the source code. The goal of
                 Dataflow Tomography is to use the inherent information
                 flow of such systems to help visualize the interactions
                 between complex and interwoven components across
                 multiple layers of abstraction. In the same way that
                 the injection of short-lived radioactive isotopes help
                 doctors trace problems in the cardiovascular system,
                 the use of ``data tagging'' can help developers slice
                 through the extraneous layers of software and pin-point
                 those portions of the system interacting with the data
                 of interest. To demonstrate the feasibility of this
                 approach we have developed a prototype system in which
                 tags are tracked both through the machine and in
                 between machines over the network, and from which novel
                 visualizations of the whole system can be derived. We
                 describe the system-level challenges in creating a
                 working system tomography tool and we qualitatively
                 evaluate our system by examining several example real
                 world scenarios.",
  acknowledgement = ack-nhfb,
  articleno =    "3",
  fjournal =     "ACM Transactions on Architecture and Code Optimization
                 (TACO)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J924",
}

@Article{Ahn:2012:ISE,
  author =       "Jung Ho Ahn and Norman P. Jouppi and Christos
                 Kozyrakis and Jacob Leverich and Robert S. Schreiber",
  title =        "Improving System Energy Efficiency with Memory Rank
                 Subsetting",
  journal =      j-TACO,
  volume =       "9",
  number =       "1",
  pages =        "4:1--4:??",
  month =        mar,
  year =         "2012",
  CODEN =        "????",
  DOI =          "http://dx.doi.org/10.1145/2133382.2133386",
  ISSN =         "1544-3566 (print), 1544-3973 (electronic)",
  ISSN-L =       "1544-3566",
  bibdate =      "Fri Mar 30 17:45:35 MDT 2012",
  bibsource =    "http://portal.acm.org/;
                 http://www.math.utah.edu/pub/tex/bib/taco.bib",
  abstract =     "VLSI process technology scaling has enabled dramatic
                 improvements in the capacity and peak bandwidth of DRAM
                 devices. However, current standard DDRx DIMM memory
                 interfaces are not well tailored to achieve high energy
                 efficiency and performance in modern
                 chip-multiprocessor-based computer systems. Their
                 suboptimal performance and energy inefficiency can have
                 a significant impact on system-wide efficiency since
                 much of the system power dissipation is due to memory
                 power. New memory interfaces, better suited for future
                 many-core systems, are needed. In response, there are
                 recent proposals to enhance the energy efficiency of
                 main-memory systems by dividing a memory rank into
                 subsets, and making a subset rather than a whole rank
                 serve a memory request. We holistically assess the
                 effectiveness of rank subsetting from system-wide
                 performance, energy-efficiency, and reliability
                 perspectives. We identify the impact of rank subsetting
                 on memory power and processor performance analytically,
                 compare two promising rank-subsetting proposals,
                 Multicore DIMM and mini-rank, and verify our analysis
                 by simulating a chip-multiprocessor system using
                 multithreaded and consolidated workloads. We extend the
                 design of Multicore DIMM for high-reliability systems
                 and show that compared with conventional chipkill
                 approaches, rank subsetting can lead to much higher
                 system-level energy efficiency and performance at the
                 cost of additional DRAM devices. This holistic
                 assessment shows that rank subsetting offers compelling
                 alternatives to existing processor-memory interfaces
                 for future DDR systems.",
  acknowledgement = ack-nhfb,
  articleno =    "4",
  fjournal =     "ACM Transactions on Architecture and Code Optimization
                 (TACO)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J924",
}

@Article{Yang:2012:CGC,
  author =       "Xuejun Yang and Li Wang and Jingling Xue and Qingbo
                 Wu",
  title =        "Comparability Graph Coloring for Optimizing
                 Utilization of Software-Managed Stream Register Files
                 for Stream Processors",
  journal =      j-TACO,
  volume =       "9",
  number =       "1",
  pages =        "5:1--5:??",
  month =        mar,
  year =         "2012",
  CODEN =        "????",
  DOI =          "http://dx.doi.org/10.1145/2133382.2133387",
  ISSN =         "1544-3566 (print), 1544-3973 (electronic)",
  ISSN-L =       "1544-3566",
  bibdate =      "Fri Mar 30 17:45:35 MDT 2012",
  bibsource =    "http://portal.acm.org/;
                 http://www.math.utah.edu/pub/tex/bib/taco.bib",
  abstract =     "The stream processors represent a promising
                 alternative to traditional cache-based general-purpose
                 processors in achieving high performance in stream
                 applications (media and some scientific applications).
                 In a stream programming model for stream processors, an
                 application is decomposed into a sequence of kernels
                 operating on streams of data. During the execution of a
                 kernel on a stream processor, all streams accessed must
                 be communicated through a nonbypassing software-managed
                 on-chip memory, the SRF (Stream Register File).
                 Optimizing utilization of the scarce on-chip memory is
                 crucial for good performance. The key insight is that
                 the interference graphs (IGs) formed by the streams in
                 stream applications tend to be comparability graphs or
                 decomposable into a set of comparability graphs. We
                 present a compiler algorithm for finding optimal or
                 near-optimal colorings, that is, SRF allocations in
                 stream IGs, by computing a maximum spanning forest of
                 the sub-IG formed by long live ranges, if necessary.
                 Our experimental results validate the optimality and
                 near-optimality of our algorithm by comparing it with
                 an ILP solver, and show that our algorithm yields
                 improved SRF utilization over the First-Fit bin-packing
                 algorithm, the best in the literature.",
  acknowledgement = ack-nhfb,
  articleno =    "5",
  fjournal =     "ACM Transactions on Architecture and Code Optimization
                 (TACO)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J924",
}

@Article{Majumdar:2012:MPE,
  author =       "Abhinandan Majumdar and Srihari Cadambi and Michela
                 Becchi and Srimat T. Chakradhar and Hans Peter Graf",
  title =        "A Massively Parallel, Energy Efficient Programmable
                 Accelerator for Learning and Classification",
  journal =      j-TACO,
  volume =       "9",
  number =       "1",
  pages =        "6:1--6:??",
  month =        mar,
  year =         "2012",
  CODEN =        "????",
  DOI =          "http://dx.doi.org/10.1145/2133382.2133388",
  ISSN =         "1544-3566 (print), 1544-3973 (electronic)",
  ISSN-L =       "1544-3566",
  bibdate =      "Fri Mar 30 17:45:35 MDT 2012",
  bibsource =    "http://portal.acm.org/;
                 http://www.math.utah.edu/pub/tex/bib/taco.bib",
  abstract =     "Applications that use learning and classification
                 algorithms operate on large amounts of unstructured
                 data, and have stringent performance constraints. For
                 such applications, the performance of general purpose
                 processors scales poorly with data size because of
                 their limited support for fine-grained parallelism and
                 absence of software-managed caches. The large
                 intermediate data in these applications also limits
                 achievable performance on many-core processors such as
                 GPUs. To accelerate such learning applications, we
                 present a programmable accelerator that can execute
                 multiple learning and classification algorithms. To
                 architect such an accelerator, we profile five
                 representative workloads, and find that their
                 computationally intensive portions can be formulated as
                 matrix or vector operations generating large amounts of
                 intermediate data, which are then reduced by a
                 secondary operation such as array ranking, finding
                 max/min and aggregation. Our proposed accelerator,
                 called MAPLE, has hundreds of simple processing
                 elements (PEs) laid out in a two-dimensional grid, with
                 two key features. First, it uses dynamic in-memory
                 processing where on-chip memory blocks perform the
                 secondary reduction operations. Second, MAPLE uses
                 banked off-chip memory, and organizes its PEs into
                 independent groups each with its own off-chip memory
                 bank. These two features allow MAPLE to scale its
                 performance with data size. We also present an Atom
                 based energy-efficient heterogeneous system with MAPLE
                 as the accelerator that satisfies the application's
                 performance requirements at a lower system power. This
                 article describes the MAPLE architecture, explores its
                 design space with a simulator, illustrates how to
                 automatically map application kernels to the hardware,
                 and presents its performance improvement and energy
                 benefits over classic server-based implementations. We
                 implement a 512-PE FPGA prototype of MAPLE and find
                 that it is 1.5--10$\times$ faster than a 2.5 GHz
                 quad-core Xeon processor despite running at a modest
                 125 MHz clock rate. With MAPLE connected to a 1.6 GHz
                 dual-core Atom, we show an energy improvement of
                 38--84\% over the Xeon server coupled to a 1.3 GHz 240
                 core Tesla GPU.",
  acknowledgement = ack-nhfb,
  articleno =    "6",
  fjournal =     "ACM Transactions on Architecture and Code Optimization
                 (TACO)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J924",
}

@Article{Eyerman:2012:PMJ,
  author =       "Stijn Eyerman and Lieven Eeckhout",
  title =        "Probabilistic modeling for job symbiosis scheduling on
                 {SMT} processors",
  journal =      j-TACO,
  volume =       "9",
  number =       "2",
  pages =        "7:1--7:??",
  month =        jun,
  year =         "2012",
  CODEN =        "????",
  DOI =          "http://dx.doi.org/10.1145/2207222.2207223",
  ISSN =         "1544-3566 (print), 1544-3973 (electronic)",
  ISSN-L =       "1544-3566",
  bibdate =      "Wed Jun 13 17:20:51 MDT 2012",
  bibsource =    "http://portal.acm.org/;
                 http://www.math.utah.edu/pub/tex/bib/taco.bib",
  abstract =     "Symbiotic job scheduling improves simultaneous
                 multithreading (SMT) processor performance by
                 coscheduling jobs that have ``compatible'' demands on
                 the processor's shared resources. Existing approaches
                 however require a sampling phase, evaluate a limited
                 number of possible coschedules, use heuristics to gauge
                 symbiosis, are rigid in their optimization target, and
                 do not preserve system-level priorities/shares. This
                 article proposes probabilistic job symbiosis modeling,
                 which predicts whether jobs will create positive or
                 negative symbiosis when coscheduled without requiring
                 the coschedule to be evaluated. The model, which uses
                 per-thread cycle stacks computed through a previously
                 proposed cycle accounting architecture, is simple
                 enough to be used in system software. Probabilistic job
                 symbiosis modeling provides six key innovations over
                 prior work in symbiotic job scheduling: (i) it does not
                 require a sampling phase, (ii) it readjusts the job
                 coschedule continuously, (iii) it evaluates a large
                 number of possible coschedules at very low overhead,
                 (iv) it is not driven by heuristics, (v) it can
                 optimize a performance target of interest (e.g., system
                 throughput or job turnaround time), and (vi) it
                 preserves system-level priorities/shares. These
                 innovations make symbiotic job scheduling both
                 practical and effective. Our experimental evaluation,
                 which assumes a realistic scenario in which jobs come
                 and go, reports an average 16\% (and up to 35\%)
                 reduction in job turnaround time compared to the
                 previously proposed SOS (sample, optimize, symbios)
                 approach for a two-thread SMT processor, and an average
                 19\% (and up to 45\%) reduction in job turnaround time
                 for a four-thread SMT processor.",
  acknowledgement = ack-nhfb,
  articleno =    "7",
  fjournal =     "ACM Transactions on Architecture and Code Optimization
                 (TACO)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J924",
}

@Article{Seghir:2012:IAT,
  author =       "Rachid Seghir and Vincent Loechner and Beno{\^\i}t
                 Meister",
  title =        "Integer affine transformations of parametric
                 {$Z$}-polytopes and applications to loop nest
                 optimization",
  journal =      j-TACO,
  volume =       "9",
  number =       "2",
  pages =        "8:1--8:??",
  month =        jun,
  year =         "2012",
  CODEN =        "????",
  DOI =          "http://dx.doi.org/10.1145/2207222.2207224",
  ISSN =         "1544-3566 (print), 1544-3973 (electronic)",
  ISSN-L =       "1544-3566",
  bibdate =      "Wed Jun 13 17:20:51 MDT 2012",
  bibsource =    "http://portal.acm.org/;
                 http://www.math.utah.edu/pub/tex/bib/taco.bib",
  abstract =     "The polyhedral model is a well-known compiler
                 optimization framework for the analysis and
                 transformation of affine loop nests. We present a new
                 method to solve a difficult geometric operation that is
                 raised by this model: the integer affine transformation
                 of parametric $Z$-polytopes. The result of such a
                 transformation is given by a worst-case exponential
                 union of $Z$-polytopes. We also propose a polynomial
                 algorithm (for fixed dimension), to count points in
                 arbitrary unions of a fixed number of parametric
                 $Z$-polytopes. We implemented these algorithms and
                 compared them to other existing algorithms, for a set
                 of applications to loop nest analysis and
                 optimization.",
  acknowledgement = ack-nhfb,
  articleno =    "8",
  fjournal =     "ACM Transactions on Architecture and Code Optimization
                 (TACO)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J924",
}

@article{Yang:2012:UOC,
  author          = {Yi Yang and Ping Xiang and Jingfei Kong and Mike
                     Mantor and Huiyang Zhou},
  title           = {A unified optimizing compiler framework for different
                     {GPGPU} architectures},
  journal         = j-TACO,
  volume          = {9},
  number          = {2},
  pages           = {9:1--9:??},
  month           = jun,
  year            = {2012},
  coden           = {????},
  doi             = {http://dx.doi.org/10.1145/2207222.2207225},
  issn            = {1544-3566 (print), 1544-3973 (electronic)},
  issn-l          = {1544-3566},
  bibdate         = {Wed Jun 13 17:20:51 MDT 2012},
  bibsource       = {http://portal.acm.org/;
                     http://www.math.utah.edu/pub/tex/bib/taco.bib},
  abstract        = {This article presents a novel optimizing compiler for
                     general purpose computation on graphics processing
                     units (GPGPU). It addresses two major challenges of
                     developing high performance GPGPU programs: effective
                     utilization of GPU memory hierarchy and judicious
                     management of parallelism. The input to our compiler is
                     a na{\"\i}ve GPU kernel function, which is functionally
                     correct but without any consideration for performance
                     optimization. The compiler generates two kernels, one
                     optimized for global memories and the other for texture
                     memories. The proposed compilation process is effective
                     for both AMD/ATI and NVIDIA GPUs. The experiments show
                     that our optimized code achieves very high performance,
                     either superior or very close to highly fine-tuned
                     libraries.},
  acknowledgement = ack-nhfb,
  articleno       = {9},
  fjournal        = {ACM Transactions on Architecture and Code Optimization
                     (TACO)},
  journal-url     = {http://portal.acm.org/browse_dl.cfm?idx=J924},
}

@article{Jang:2012:ACO,
  author          = {Choonki Jang and Jaejin Lee and Bernhard Egger and
                     Soojung Ryu},
  title           = {Automatic code overlay generation and partially
                     redundant code fetch elimination},
  journal         = j-TACO,
  volume          = {9},
  number          = {2},
  pages           = {10:1--10:??},
  month           = jun,
  year            = {2012},
  coden           = {????},
  doi             = {http://dx.doi.org/10.1145/2207222.2207226},
  issn            = {1544-3566 (print), 1544-3973 (electronic)},
  issn-l          = {1544-3566},
  bibdate         = {Wed Jun 13 17:20:51 MDT 2012},
  bibsource       = {http://portal.acm.org/;
                     http://www.math.utah.edu/pub/tex/bib/taco.bib},
  abstract        = {There is an increasing interest in explicitly managed
                     memory hierarchies, where a hierarchy of distinct
                     memories is exposed to the programmer and managed
                     explicitly in software. These hierarchies can be found
                     in typical embedded systems and an emerging class of
                     multicore architectures. To run an application that
                     requires more code memory than the available
                     higher-level memory, typically an overlay structure is
                     needed. The overlay structure is generated manually by
                     the programmer or automatically by a specialized
                     linker. Manual code overlaying requires the programmer
                     to deeply understand the program structure for maximum
                     memory savings as well as minimum performance
                     degradation. Although the linker can automatically
                     generate the code overlay structure, its memory savings
                     are limited and it even brings significant performance
                     degradation because traditional techniques do not
                     consider the program context. In this article, we
                     propose an automatic code overlay generation technique
                     that overcomes the limitations of traditional automatic
                     code overlaying techniques. We are dealing with a
                     system context that imposes two distinct constraints:
                     (1) no hardware support for address translation and (2)
                     a spatially and temporally coarse grained faulting
                     mechanism at the function level. Our approach addresses
                     those two constraints as efficiently as possible. Our
                     technique statically computes the Worst-Case Number of
                     Conflict misses (WCNC) between two different code
                     segments using path expressions. Then, it constructs a
                     static temporal relationship graph with the WCNCs and
                     emits an overlay structure for a given higher-level
                     memory size. We also propose an inter-procedural
                     partial redundancy elimination technique that minimizes
                     redundant code copying caused by the generated overlay
                     structure. Experimental results show that our approach
                     is promising.},
  acknowledgement = ack-nhfb,
  articleno       = {10},
  fjournal        = {ACM Transactions on Architecture and Code Optimization
                     (TACO)},
  journal-url     = {http://portal.acm.org/browse_dl.cfm?idx=J924},
}

@article{Abbasi:2012:TSW,
  author          = {Zahra Abbasi and Georgios Varsamopoulos and Sandeep K.
                     S. Gupta},
  title           = {{TACOMA}: Server and workload management in {Internet}
                     data centers considering cooling-computing power
                     trade-off and energy proportionality},
  journal         = j-TACO,
  volume          = {9},
  number          = {2},
  pages           = {11:1--11:??},
  month           = jun,
  year            = {2012},
  coden           = {????},
  doi             = {http://dx.doi.org/10.1145/2207222.2207227},
  issn            = {1544-3566 (print), 1544-3973 (electronic)},
  issn-l          = {1544-3566},
  bibdate         = {Wed Jun 13 17:20:51 MDT 2012},
  bibsource       = {http://portal.acm.org/;
                     http://www.math.utah.edu/pub/tex/bib/taco.bib},
  abstract        = {A two-tier Internet data center management scheme,
                     TACOMA, with thermal-aware server provisioning (TASP)
                     in one tier, and thermal-aware workload distribution
                     (TAWD) in the other is proposed. TASP and TAWD
                     coordinate to maximize the energy savings by leveraging
                     the workload dynamics, at coarse and fine time scale,
                     respectively. TACOMA is aware of the QoS constraints,
                     the energy proportionality of servers, and the
                     potential trade-off between cooling and computing
                     power. The obtained energy savings are a combination of
                     suspending idle servers, using servers at their peak
                     efficiency, and avoiding heat recirculation.},
  acknowledgement = ack-nhfb,
  articleno       = {11},
  fjournal        = {ACM Transactions on Architecture and Code Optimization
                     (TACO)},
  journal-url     = {http://portal.acm.org/browse_dl.cfm?idx=J924},
}

@article{Lankes:2012:BSP,
  author          = {Andreas Lankes and Thomas Wild and Stefan Wallentowitz
                     and Andreas Herkersdorf},
  title           = {Benefits of selective packet discard in
                     networks-on-chip},
  journal         = j-TACO,
  volume          = {9},
  number          = {2},
  pages           = {12:1--12:??},
  month           = jun,
  year            = {2012},
  coden           = {????},
  doi             = {http://dx.doi.org/10.1145/2207222.2207228},
  issn            = {1544-3566 (print), 1544-3973 (electronic)},
  issn-l          = {1544-3566},
  bibdate         = {Wed Jun 13 17:20:51 MDT 2012},
  bibsource       = {http://portal.acm.org/;
                     http://www.math.utah.edu/pub/tex/bib/taco.bib},
  abstract        = {Today, Network on Chip concepts principally assume
                     inherent lossless operation. Considering that future
                     nanometer CMOS technologies will witness increased
                     sensitivity to all forms of manufacturing and
                     environmental variations (e.g., IR drop, soft errors
                     due to radiation, transient temperature induced timing
                     problems, device aging), efforts to cope with data
                     corruption or packet loss will be unavoidable. Possible
                     counter measures against packet loss are the extension
                     of flits with ECC or the introduction of error
                     detection with retransmission. We propose to make use
                     of the perceived deficiency of packet loss as a
                     feature. By selectively discarding stuck packets in the
                     NoC, a proven practice in computer networks, all types
                     of deadlocks can be resolved. This is especially
                     advantageous for solving the problem of
                     message-dependent deadlocks, which otherwise leads to
                     high costs either in terms of throughput or chip area.
                     Strict ordering, the most popular approach to this
                     problem, results in a significant buffer overhead and a
                     more complex router architecture. In addition, we will
                     show that eliminating local network congestions by
                     selectively discarding individual packets also can
                     improve the effective throughput of the network. The
                     end-to-end retransmission mechanism required for the
                     reliable communication, then also provides lossless
                     communication for the cores.},
  acknowledgement = ack-nhfb,
  articleno       = {12},
  fjournal        = {ACM Transactions on Architecture and Code Optimization
                     (TACO)},
  journal-url     = {http://portal.acm.org/browse_dl.cfm?idx=J924},
}

@article{Luo:2012:DDS,
  author          = {Yangchun Luo and Antonia Zhai},
  title           = {Dynamically dispatching speculative threads to improve
                     sequential execution},
  journal         = j-TACO,
  volume          = {9},
  number          = {3},
  pages           = {13:1--13:??},
  month           = sep,
  year            = {2012},
  coden           = {????},
  doi             = {http://dx.doi.org/10.1145/2355585.2355586},
  issn            = {1544-3566 (print), 1544-3973 (electronic)},
  issn-l          = {1544-3566},
  bibdate         = {Mon Oct 22 10:48:53 MDT 2012},
  bibsource       = {http://portal.acm.org/;
                     http://www.math.utah.edu/pub/tex/bib/taco.bib},
  abstract        = {Efficiently utilizing multicore processors to improve
                     their performance potentials demands extracting
                     thread-level parallelism from the applications. Various
                     novel and sophisticated execution models have been
                     proposed to extract thread-level parallelism from
                     sequential programs. One such execution model,
                     Thread-Level Speculation (TLS), allows potentially
                     dependent threads to execute speculatively in parallel.
                     However, TLS execution is inherently unpredictable, and
                     consequently incorrect speculation could degrade
                     performance for the multicore systems. Existing
                     approaches have focused on using the compilers to
                     select sequential program regions to apply TLS. Our
                     research shows that even the state-of-the-art compiler
                     makes suboptimal decisions, due to the unpredictability
                     of TLS execution. Thus, we propose to dynamically
                     optimize TLS performance. This article describes the
                     design, implementation, and evaluation of a runtime
                     thread dispatching mechanism that adjusts the behaviors
                     of speculative threads based on their efficiency. In
                     the proposed system, speculative threads are monitored
                     by hardware-based performance counters and their
                     performance impact is evaluated with a novel
                     methodology that takes into account various unique TLS
                     characteristics. Thread dispatching policies are
                     devised to adjust the behaviors of speculative threads
                     accordingly. With the help of the runtime evaluation,
                     where and how to create speculative threads is better
                     determined. Evaluated with all the SPEC CPU2000
                     benchmark programs written in C, the dynamic
                     dispatching system outperforms the state-of-the-art
                     compiler-based thread management techniques by 9.4\% on
                     average. Comparing to sequential execution, we achieve
                     1.37X performance improvement on a four-core CMP-based
                     system.},
  acknowledgement = ack-nhfb,
  articleno       = {13},
  fjournal        = {ACM Transactions on Architecture and Code Optimization
                     (TACO)},
  journal-url     = {http://portal.acm.org/browse_dl.cfm?idx=J924},
}

@article{Cui:2012:EPO,
  author          = {Huimin Cui and Jingling Xue and Lei Wang and Yang Yang
                     and Xiaobing Feng and Dongrui Fan},
  title           = {Extendable pattern-oriented optimization directives},
  journal         = j-TACO,
  volume          = {9},
  number          = {3},
  pages           = {14:1--14:??},
  month           = sep,
  year            = {2012},
  coden           = {????},
  doi             = {http://dx.doi.org/10.1145/2355585.2355587},
  issn            = {1544-3566 (print), 1544-3973 (electronic)},
  issn-l          = {1544-3566},
  bibdate         = {Mon Oct 22 10:48:53 MDT 2012},
  bibsource       = {http://portal.acm.org/;
                     http://www.math.utah.edu/pub/tex/bib/taco.bib},
  abstract        = {Algorithm-specific, that is, semantic-specific
                     optimizations have been observed to bring significant
                     performance gains, especially for a diverse set of
                     multi/many-core architectures. However, current
                     programming models and compiler technologies for the
                     state-of-the-art architectures do not exploit well
                     these performance opportunities. In this article, we
                     propose a pattern-making methodology that enables
                     algorithm-specific optimizations to be encapsulated
                     into ``optimization patterns''. Such optimization
                     patterns are expressed in terms of preprocessor
                     directives so that simple annotations can result in
                     significant performance improvements. To validate this
                     new methodology, a framework, named EPOD, is developed
                     to map these directives into the underlying
                     optimization schemes for a particular architecture. It
                     is difficult to create an exact performance model to
                     determine an optimal or near-optimal optimization
                     scheme (including which optimizations to apply and in
                     which order) for a specific application, due to the
                     complexity of applications and architectures. However,
                     it is trackable to build individual optimization
                     components and let compiler developers synthesize an
                     optimization scheme from these components. Therefore,
                     our EPOD framework provides an Optimization Programming
                     Interface (OPI) for compiler developers to define new
                     optimization schemes. Thus, new patterns can be
                     integrated into EPOD in a flexible manner. We have
                     identified and implemented a number of optimization
                     patterns for three representative computer platforms.
                     Our experimental results show that a pattern-guided
                     compiler can outperform the state-of-the-art compilers
                     and even achieve performance as competitive as
                     hand-tuned code. Therefore, such a pattern-making
                     methodology represents an encouraging direction for
                     domain experts' experience and knowledge to be
                     integrated into general-purpose compilers.},
  acknowledgement = ack-nhfb,
  articleno       = {14},
  fjournal        = {ACM Transactions on Architecture and Code Optimization
                     (TACO)},
  journal-url     = {http://portal.acm.org/browse_dl.cfm?idx=J924},
}

@article{Lewis:2012:REC,
  author          = {Adam Wade Lewis and Nian-Feng Tzeng and Soumik
                     Ghosh},
  title           = {Runtime energy consumption estimation for server
                     workloads based on chaotic time-series approximation},
  journal         = j-TACO,
  volume          = {9},
  number          = {3},
  pages           = {15:1--15:??},
  month           = sep,
  year            = {2012},
  coden           = {????},
  doi             = {http://dx.doi.org/10.1145/2355585.2355588},
  issn            = {1544-3566 (print), 1544-3973 (electronic)},
  issn-l          = {1544-3566},
  bibdate         = {Mon Oct 22 10:48:53 MDT 2012},
  bibsource       = {http://portal.acm.org/;
                     http://www.math.utah.edu/pub/tex/bib/taco.bib},
  abstract        = {This article proposes a runtime model that relates
                     server energy consumption to its overall thermal
                     envelope, using hardware performance counters and
                     experimental measurements. While previous studies have
                     attempted system-wide modeling of server power
                     consumption through subsystem models, our approach is
                     different in that it links system energy input to
                     subsystem energy consumption based on a small set of
                     tightly correlated parameters. The proposed model takes
                     into account processor power, bus activities, and
                     system ambient temperature for real-time prediction on
                     the power consumption of long running jobs. Using the
                     HyperTransport and QuickPath Link structures as case
                     studies and through electrical measurements on example
                     server subsystems, we develop a chaotic time-series
                     approximation for runtime power consumption, arriving
                     at the Chaotic Attractor Predictor (CAP). With
                     polynomial time complexity, CAP exhibits high
                     prediction accuracy, having the prediction errors
                     within 1.6\% (or 3.3\%) for servers based on the
                     HyperTransport bus (or the QuickPath Links), as
                     verified by a set of common processor benchmarks. Our
                     CAP is a superior predictive mechanism over existing
                     linear auto-regressive methods, which require expensive
                     and complex corrective steps to address the nonlinear
                     and chaotic aspects of the underlying physical
                     system.},
  acknowledgement = ack-nhfb,
  articleno       = {15},
  fjournal        = {ACM Transactions on Architecture and Code Optimization
                     (TACO)},
  journal-url     = {http://portal.acm.org/browse_dl.cfm?idx=J924},
}

@article{Valero:2012:CRI,
  author          = {Alejandro Valero and Julio Sahuquillo and Salvador
                     Petit and Pedro L{\'o}pez and Jos{\'e} Duato},
  title           = {Combining recency of information with selective random
                     and a victim cache in last-level caches},
  journal         = j-TACO,
  volume          = {9},
  number          = {3},
  pages           = {16:1--16:??},
  month           = sep,
  year            = {2012},
  coden           = {????},
  doi             = {http://dx.doi.org/10.1145/2355585.2355589},
  issn            = {1544-3566 (print), 1544-3973 (electronic)},
  issn-l          = {1544-3566},
  bibdate         = {Mon Oct 22 10:48:53 MDT 2012},
  bibsource       = {http://portal.acm.org/;
                     http://www.math.utah.edu/pub/tex/bib/taco.bib},
  abstract        = {Memory latency has become an important performance
                     bottleneck in current microprocessors. This problem
                     aggravates as the number of cores sharing the same
                     memory controller increases. To palliate this problem,
                     a common solution is to implement cache hierarchies
                     with large or huge Last-Level Cache (LLC)
                     organizations. LLC memories are implemented with a high
                     number of ways (e.g., 16) to reduce conflict misses.
                     Typically, caches have implemented the LRU algorithm to
                     exploit temporal locality, but its performance goes
                     away from the optimal as the number of ways increases.
                     In addition, the implementation of a strict LRU
                     algorithm is costly in terms of area and power. This
                     article focuses on a family of low-cost replacement
                     strategies, whose implementation scales with the number
                     of ways while maintaining the performance. The proposed
                     strategies track the accessing order for just a few
                     blocks, which cannot be replaced. The victim is
                     randomly selected among those blocks exhibiting poor
                     locality. Although, in general, the random policy helps
                     improving the performance, in some applications the
                     scheme fails with respect to the LRU policy leading to
                     performance degradation. This drawback can be overcome
                     by the addition of a small victim cache of the large
                     LLC. Experimental results show that, using the best
                     version of the family without victim cache, MPKI
                     reduction falls in between 10\% and 11\% compared to a
                     set of the most representative state-of-the-art
                     algorithms, whereas the reduction grows up to 22\% with
                     respect to LRU. The proposal with victim cache achieves
                     speedup improvements, on average, by 4\% compared to
                     LRU. In addition, it reduces dynamic energy, on
                     average, up to 8\%. Finally, compared to the studied
                     algorithms, hardware complexity is largely reduced by
                     the baseline algorithm of the family.},
  acknowledgement = ack-nhfb,
  articleno       = {16},
  fjournal        = {ACM Transactions on Architecture and Code Optimization
                     (TACO)},
  journal-url     = {http://portal.acm.org/browse_dl.cfm?idx=J924},
}

@article{Li:2012:DQM,
  author          = {Bin Li and Li-Shiuan Peh and Li Zhao and Ravi Iyer},
  title           = {Dynamic {QoS} management for chip multiprocessors},
  journal         = j-TACO,
  volume          = {9},
  number          = {3},
  pages           = {17:1--17:??},
  month           = sep,
  year            = {2012},
  coden           = {????},
  doi             = {http://dx.doi.org/10.1145/2355585.2355590},
  issn            = {1544-3566 (print), 1544-3973 (electronic)},
  issn-l          = {1544-3566},
  bibdate         = {Mon Oct 22 10:48:53 MDT 2012},
  bibsource       = {http://portal.acm.org/;
                     http://www.math.utah.edu/pub/tex/bib/taco.bib},
  abstract        = {With the continuing scaling of semiconductor
                     technologies, chip multiprocessor (CMP) has become the
                     de facto design for modern high performance computer
                     architectures. It is expected that more and more
                     applications with diverse requirements will run
                     simultaneously on the CMP platform. However, this will
                     exert contention on shared resources such as the last
                     level cache, network-on-chip bandwidth and off-chip
                     memory bandwidth, thus affecting the performance and
                     quality-of-service (QoS) significantly. In this
                     environment, efficient resource sharing and a guarantee
                     of a certain level of performance is highly desirable.
                     Researchers have proposed different frameworks for
                     providing QoS. Most of these frameworks focus on
                     individual resource for QoS management. Coordinated
                     management of multiple QoS-aware shared resources at
                     runtime remains an open problem. Recently, there has
                     been work that proposed a class-of-serviced based
                     framework to jointly managing cache, NoC and memory
                     resources simultaneously. However, the work allocates
                     shared resources statically at the beginning of
                     application runtime, and do not dynamically track,
                     manage and share shared resources across applications.
                     In this article, we address this limitation by
                     proposing dynamic resource management policies that
                     monitor the resource usage of applications at runtime,
                     then steals resources from the high-priority
                     applications for lower-priority ones. The goal is to
                     maintain the targeted level of performance for
                     high-priority applications while improving the
                     performance of lower-priority applications. We use a PI
                     (Proportional-Integral gain) feedback controller based
                     technique to maintain stability in our framework. Our
                     evaluation results show that our policy can improve
                     performance for lower-priority applications
                     significantly while maintaining the performance for
                     high-priority application, thus demonstrating the
                     effectiveness of our dynamic QoS resource management
                     policy.},
  acknowledgement = ack-nhfb,
  articleno       = {17},
  fjournal        = {ACM Transactions on Architecture and Code Optimization
                     (TACO)},
  journal-url     = {http://portal.acm.org/browse_dl.cfm?idx=J924},
}

@article{Xekalakis:2012:MSM,
  author          = {Polychronis Xekalakis and Nikolas Ioannou and Marcelo
                     Cintra},
  title           = {Mixed speculative multithreaded execution models},
  journal         = j-TACO,
  volume          = {9},
  number          = {3},
  pages           = {18:1--18:??},
  month           = sep,
  year            = {2012},
  coden           = {????},
  doi             = {http://dx.doi.org/10.1145/2355585.2355591},
  issn            = {1544-3566 (print), 1544-3973 (electronic)},
  issn-l          = {1544-3566},
  bibdate         = {Mon Oct 22 10:48:53 MDT 2012},
  bibsource       = {http://portal.acm.org/;
                     http://www.math.utah.edu/pub/tex/bib/taco.bib},
  abstract        = {The current trend toward multicore architectures has
                     placed great pressure on programmers and compilers to
                     generate thread-parallel programs. Improved execution
                     performance can no longer be obtained via traditional
                     single-thread instruction level parallelism (ILP), but,
                     instead, via multithreaded execution. One notable
                     technique that facilitates the extraction of parallel
                     threads from sequential applications is thread-level
                     speculation (TLS). This technique allows
                     programmers/compilers to generate threads without
                     checking for inter-thread data and control dependences,
                     which are then transparently enforced by the hardware.
                     Most prior work on TLS has concentrated on thread
                     selection and mechanisms to efficiently support the
                     main TLS operations, such as squashes, data versioning,
                     and commits. This article seeks to enhance TLS
                     functionality by combining it with other speculative
                     multithreaded execution models. The main idea is that
                     TLS already requires extensive hardware support, which
                     when slightly augmented can accommodate other
                     speculative multithreaded techniques. Recognizing that
                     for different applications, or even program phases, the
                     application bottlenecks may be different, it is
                     reasonable to assume that the more versatile a system
                     is, the more efficiently it will be able to execute the
                     given program. Toward this direction, we first show
                     that mixed execution models that combine TLS with
                     Helper Threads (HT), RunAhead execution (RA) and
                     MultiPath execution (MP) perform better than any of the
                     models alone. Based on a simple model that we propose,
                     we show that benefits come from being able to extract
                     additional ILP without harming the TLP extracted by
                     TLS. We then show that by combining all the execution
                     models in a unified one that combines all these
                     speculative multithreaded models, ILP can be further
                     enhanced with only minimal additional cost in
                     hardware.},
  acknowledgement = ack-nhfb,
  articleno       = {18},
  fjournal        = {ACM Transactions on Architecture and Code Optimization
                     (TACO)},
  journal-url     = {http://portal.acm.org/browse_dl.cfm?idx=J924},
}

@article{Sharafeddine:2012:DOE,
  author          = {Mageda Sharafeddine and Komal Jothi and Haitham
                     Akkary},
  title           = {Disjoint out-of-order execution processor},
  journal         = j-TACO,
  volume          = {9},
  number          = {3},
  pages           = {19:1--19:??},
  month           = sep,
  year            = {2012},
  coden           = {????},
  doi             = {http://dx.doi.org/10.1145/2355585.2355592},
  issn            = {1544-3566 (print), 1544-3973 (electronic)},
  issn-l          = {1544-3566},
  bibdate         = {Mon Oct 22 10:48:53 MDT 2012},
  bibsource       = {http://portal.acm.org/;
                     http://www.math.utah.edu/pub/tex/bib/taco.bib},
  abstract        = {High-performance superscalar architectures used to
                     exploit instruction level parallelism in single-thread
                     applications have become too complex and power hungry
                     for the multicore processors era. We propose a new
                     architecture that uses multiple small latency-tolerant
                     out-of-order cores to improve single-thread
                     performance. Improving single-thread performance with
                     multiple small out-of-order cores allows designers to
                     place more of these cores on the same die.
                     Consequently, emerging highly parallel applications can
                     take full advantage of the multicore parallel hardware
                     without sacrificing performance of inherently serial
                     and hard to parallelize applications. Our architecture
                     combines speculative multithreading (SpMT) with
                     checkpoint recovery and continual flow pipeline
                     architectures. It splits single-thread program
                     execution into disjoint control and data threads that
                     execute concurrently on multiple cooperating small and
                     latency-tolerant out-of-order cores. Hence we call this
                     style of execution Disjoint Out-of-Order Execution
                     (DOE). DOE uses latency tolerance to overcome
                     performance issues of SpMT caused by interthread data
                     dependences. To evaluate this architecture, we have
                     developed a microarchitecture performance model of DOE
                     based on PTLSim, a simulation infrastructure of the x86
                     instruction set architecture. We evaluate the potential
                     performance of DOE processor architecture using a
                     simple heuristic to fork control independent threads in
                     hardware at the target addresses of future procedure
                     return instructions. Using applications from SpecInt
                     2000, we study DOE under ideal as well as realistic
                     architectural constraints. We discuss the performance
                     impact of key DOE architecture and application
                     variables such as number of cores, interthread data
                     dependences, intercore data communication delay,
                     buffers capacity, and branch mispredictions. Without
                     any DOE specific compiler optimizations, our results
                     show that DOE outperforms conventional SpMT
                     architectures by 15\%, on average. We also show that
                     DOE with four small cores can perform on average
                     equally well to a large superscalar core, consuming
                     about the same power. Most importantly, DOE improves
                     throughput performance by a significant amount over a
                     large superscalar core, up to 2.5 times, when running
                     multitasking applications.},
  acknowledgement = ack-nhfb,
  articleno       = {19},
  fjournal        = {ACM Transactions on Architecture and Code Optimization
                     (TACO)},
  journal-url     = {http://portal.acm.org/browse_dl.cfm?idx=J924},
}

@Article{Andrade:2012:SAW,
  author =       "Diego Andrade and Basilio B. Fraguela and Ram{\'o}n
                 Doallo",
  title =        "Static analysis of the worst-case memory performance
                 for irregular codes with indirections",
  journal =      j-TACO,
  volume =       "9",
  number =       "3",
  pages =        "20:1--20:??",
  month =        sep,
  year =         "2012",
  CODEN =        "????",
  DOI =          "http://dx.doi.org/10.1145/2355585.2355593",
  ISSN =         "1544-3566 (print), 1544-3973 (electronic)",
  ISSN-L =       "1544-3566",
  bibdate =      "Mon Oct 22 10:48:53 MDT 2012",
  bibsource =    "http://portal.acm.org/;
                 http://www.math.utah.edu/pub/tex/bib/taco.bib",
  abstract =     "Real-time systems are subject to timing constraints,
                 whose upper bound is given by the Worst-Case Execution
                 Time (WCET). Cache memory behavior is difficult to
                 predict analytically and estimating a safe and precise
                 worst-case value is even more challenging. The
                 worst-case memory performance (WCMP) component of the
                 WCET can only be estimated with the precise knowledge
                 of the stream of data addresses accessed by the code,
                 which is determined by the access patterns and the base
                 addresses of the data structures accessed. The
                 regularity of strided access patterns simplifies their
                 analysis, as they are characterized by relatively few
                 parameters, which are often available at compile time.
                 Unfortunately codes may exhibit irregular access
                 patterns, which are much more difficult to statically
                 analyze. As for the base addresses of the data
                 structures, they are not always available at
                 compile-time for many reasons: stack variables,
                 dynamically allocated memory, modules compiled
                 separately, etc. This article addresses these problems
                 by presenting a model that predicts an upper
                 bound of the data cache performance for codes both with
                 regular and irregular access patterns, which is valid
                 for any possible base addresses of the data structures.
                 The model analyzes irregular access patterns due to the
                 presence of indirections in the code and it can provide
                 two kinds of predictions: a safe hard boundary that is
                 suitable for hard real-time systems and a soft boundary
                 whose safeness is not guaranteed but which is valid
                 most of the times. In fact, in all our experiments the
                 number of misses was below the soft boundary predicted
                 by the model. This turns this soft boundary prediction
                 into a valuable tool, particularly for non and soft
                 real-time systems, which tolerate a percentage of the
                 runs exceeding their deadlines.",
  acknowledgement = ack-nhfb,
  articleno =    "20",
  fjournal =     "ACM Transactions on Architecture and Code Optimization
                 (TACO)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J924",
}

@Article{Chen:2012:DIO,
  author =       "Yang Chen and Shuangde Fang and Yuanjie Huang and
                 Lieven Eeckhout and Grigori Fursin and Olivier Temam
                 and Chengyong Wu",
  title =        "Deconstructing iterative optimization",
  journal =      j-TACO,
  volume =       "9",
  number =       "3",
  pages =        "21:1--21:??",
  month =        sep,
  year =         "2012",
  CODEN =        "????",
  DOI =          "http://dx.doi.org/10.1145/2355585.2355594",
  ISSN =         "1544-3566 (print), 1544-3973 (electronic)",
  ISSN-L =       "1544-3566",
  bibdate =      "Mon Oct 22 10:48:53 MDT 2012",
  bibsource =    "http://portal.acm.org/;
                 http://www.math.utah.edu/pub/tex/bib/taco.bib",
  abstract =     "Iterative optimization is a popular compiler
                 optimization approach that has been studied extensively
                 over the past decade. In this article, we deconstruct
                 iterative optimization by evaluating whether it works
                 across datasets and by analyzing why it works. Up to
                 now, most iterative optimization studies are based on a
                 premise which was never truly evaluated: that it is
                 possible to learn the best compiler optimizations
                 across datasets. In this article, we evaluate this
                 question for the first time with a very large number of
                 datasets. We therefore compose KDataSets, a dataset
                 suite with 1000 datasets for 32 programs, which we
                 release to the public. We characterize the diversity of
                 KDataSets, and subsequently use it to evaluate
                 iterative optimization. For all 32 programs, we find
                 that there exists at least one combination of compiler
                 optimizations that achieves at least 83\% or more of
                 the best possible speedup across all datasets on two
                 widely used compilers (Intel's ICC and GNU's GCC). This
                 optimal combination is program-specific and yields
                 speedups up to 3.75$\times$ (averaged across datasets
                 of a program) over the highest optimization level of
                 the compilers (-O3 for GCC and -fast for ICC). This
                 finding suggests that optimizing programs across
                 datasets might be much easier than previously
                 anticipated. In addition, we evaluate the idea of
                 introducing compiler choice as part of iterative
                 optimization. We find that it can further improve the
                 performance of iterative optimization because different
                 programs favor different compilers. We also investigate
                 why iterative optimization works by analyzing the
                 optimal combinations. We find that only a handful of
                 optimizations yield most of the speedup. Finally, we
                 show that optimizations interact in a complex and
                 sometimes counterintuitive way through two case
                 studies, which confirms that iterative optimization is
                 an irreplaceable and important compiler strategy.",
  acknowledgement = ack-nhfb,
  articleno =    "21",
  fjournal =     "ACM Transactions on Architecture and Code Optimization
                 (TACO)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J924",
}

@Article{Guha:2012:MOD,
  author =       "Apala Guha and Kim Hazelwood and Mary Lou Soffa",
  title =        "Memory optimization of dynamic binary translators for
                 embedded systems",
  journal =      j-TACO,
  volume =       "9",
  number =       "3",
  pages =        "22:1--22:??",
  month =        sep,
  year =         "2012",
  CODEN =        "????",
  DOI =          "http://dx.doi.org/10.1145/2355585.2355595",
  ISSN =         "1544-3566 (print), 1544-3973 (electronic)",
  ISSN-L =       "1544-3566",
  bibdate =      "Mon Oct 22 10:48:53 MDT 2012",
  bibsource =    "http://portal.acm.org/;
                 http://www.math.utah.edu/pub/tex/bib/taco.bib",
  abstract =     "Dynamic binary translators (DBTs) are becoming
                 increasingly important because of their power and
                 flexibility. DBT-based services are valuable for all
                 types of platforms. However, the high memory demands of
                 DBTs present an obstacle for embedded systems. Most
                 research on DBT design has a performance focus, which
                 often drives up the DBT memory demand. In this article,
                 we present a memory-oriented approach to DBT design. We
                 consider the class of translation-based DBTs and their
                 sources of memory demand: cached translated code,
                 cached auxiliary code and DBT data structures. We
                 explore aspects of DBT design that impact these memory
                 demand sources and present strategies to mitigate
                 memory demand. We also explore performance
                 optimizations for DBTs that handle memory demand by
                 placing a limit on it, and repeatedly flush
                 translations to stay within the limit, thereby
                 replacing the memory demand problem with a performance
                 degradation problem. Our optimizations that mitigate
                 memory demand improve performance.",
  acknowledgement = ack-nhfb,
  articleno =    "22",
  fjournal =     "ACM Transactions on Architecture and Code Optimization
                 (TACO)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J924",
}

@Article{Geraci:2012:TFP,
  author =       "James R. Geraci and Sharon M. Sacco",
  title =        "A transpose-free in-place {SIMD} optimized {FFT}",
  journal =      j-TACO,
  volume =       "9",
  number =       "3",
  pages =        "23:1--23:??",
  month =        sep,
  year =         "2012",
  CODEN =        "????",
  DOI =          "http://dx.doi.org/10.1145/2355585.2355596",
  ISSN =         "1544-3566 (print), 1544-3973 (electronic)",
  ISSN-L =       "1544-3566",
  bibdate =      "Mon Oct 22 10:48:53 MDT 2012",
  bibsource =    "http://portal.acm.org/;
                 http://www.math.utah.edu/pub/tex/bib/taco.bib",
  abstract =     "A transpose-free in-place SIMD optimized algorithm for
                 the computation of large FFTs is introduced and
                 implemented on the Cell Broadband Engine. Six different
                 FFT implementations of the algorithm using six
                 different data movement methods are described. Their
                 relative performance is compared for input sizes from
                 $2^{17}$ to $2^{21}$ complex floating point samples.
                 Large differences in performance are observed among
                 even theoretically equivalent data movement patterns.
                 All six implementations compare favorably with FFTW and
                 other previous FFT implementations.",
  acknowledgement = ack-nhfb,
  articleno =    "23",
  fjournal =     "ACM Transactions on Architecture and Code Optimization
                 (TACO)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J924",
}

@Article{Coppens:2013:FDB,
  author =       "Bart Coppens and Bjorn {De Sutter} and Jonas Maebe",
  title =        "Feedback-driven binary code diversification",
  journal =      j-TACO,
  volume =       "9",
  number =       "4",
  pages =        "24:1--24:??",
  month =        jan,
  year =         "2013",
  CODEN =        "????",
  DOI =          "http://dx.doi.org/10.1145/2400682.2400683",
  ISSN =         "1544-3566 (print), 1544-3973 (electronic)",
  ISSN-L =       "1544-3566",
  bibdate =      "Fri Jan 18 10:57:16 MST 2013",
  bibsource =    "http://portal.acm.org/;
                 http://www.math.utah.edu/pub/tex/bib/taco.bib",
  abstract =     "As described in many blog posts and in the scientific
                 literature, exploits for software vulnerabilities are
                 often engineered on the basis of patches. For example,
                 ``Microsoft Patch Tuesday'' is often followed by
                 ``Exploit Wednesday'' during which yet unpatched
                 systems become vulnerable to patch-based exploits. Part
                 of the patch engineering includes the identification of
                 the vulnerable binary code by means of
                 reverse-engineering tools and diffing add-ons. In this
                 article we present a feedback-driven compiler tool flow
                 that iteratively transforms code until diffing tools
                 become ineffective enough to close the ``Exploit
                 Wednesday'' window of opportunity. We demonstrate the
                 tool's effectiveness on a set of real-world patches and
                 against the latest version of BinDiff.",
  acknowledgement = ack-nhfb,
  articleno =    "24",
  fjournal =     "ACM Transactions on Architecture and Code Optimization
                 (TACO)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J924",
}

@Article{Fowers:2013:PEC,
  author =       "Jeremy Fowers and Greg Brown and John Wernsing and
                 Greg Stitt",
  title =        "A performance and energy comparison of convolution on
                 {GPUs}, {FPGAs}, and multicore processors",
  journal =      j-TACO,
  volume =       "9",
  number =       "4",
  pages =        "25:1--25:??",
  month =        jan,
  year =         "2013",
  CODEN =        "????",
  DOI =          "http://dx.doi.org/10.1145/2400682.2400684",
  ISSN =         "1544-3566 (print), 1544-3973 (electronic)",
  ISSN-L =       "1544-3566",
  bibdate =      "Fri Jan 18 10:57:16 MST 2013",
  bibsource =    "http://portal.acm.org/;
                 http://www.math.utah.edu/pub/tex/bib/taco.bib",
  abstract =     "Recent architectural trends have focused on increased
                 parallelism via multicore processors and increased
                 heterogeneity via accelerator devices (e.g.,
                 graphics-processing units, field-programmable gate
                 arrays). Although these architectures have significant
                 performance and energy potential, application designers
                 face many device-specific challenges when choosing an
                 appropriate accelerator or when customizing an
                 algorithm for an accelerator. To help address this
                 problem, in this article we thoroughly evaluate
                 convolution, one of the most common operations in
                 digital-signal processing, on multicores,
                 graphics-processing units, and field-programmable gate
                 arrays. Whereas many previous application studies
                 evaluate a specific usage of an application, this
                 article assists designers with design space exploration
                 for numerous use cases by analyzing effects of
                 different input sizes, different algorithms, and
                 different devices, while also determining
                 Pareto-optimal trade-offs between performance and
                 energy.",
  acknowledgement = ack-nhfb,
  articleno =    "25",
  fjournal =     "ACM Transactions on Architecture and Code Optimization
                 (TACO)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J924",
}

@Article{Rohou:2013:VTI,
  author =       "Erven Rohou and Kevin Williams and David Yuste",
  title =        "Vectorization technology to improve interpreter
                 performance",
  journal =      j-TACO,
  volume =       "9",
  number =       "4",
  pages =        "26:1--26:??",
  month =        jan,
  year =         "2013",
  CODEN =        "????",
  DOI =          "http://dx.doi.org/10.1145/2400682.2400685",
  ISSN =         "1544-3566 (print), 1544-3973 (electronic)",
  ISSN-L =       "1544-3566",
  bibdate =      "Fri Jan 18 10:57:16 MST 2013",
  bibsource =    "http://portal.acm.org/;
                 http://www.math.utah.edu/pub/tex/bib/taco.bib",
  abstract =     "In the present computing landscape, interpreters are
                 in use in a wide range of systems. Recent trends in
                 consumer electronics have created a new category of
                 portable, lightweight software applications. Typically,
                 these applications have fast development cycles and
                 short life spans. They run on a wide range of systems
                 and are deployed in a target independent bytecode
                 format over Internet and cellular networks. Their
                 authors are untrusted third-party vendors, and they are
                 executed in secure managed runtimes or virtual
                 machines. Furthermore, due to security policies or
                 development time constraints, these virtual machines
                 often lack just-in-time compilers and rely on
                 interpreted execution. At the other end of the
                 spectrum, interpreters are also a reality in the field
                 of high-performance computations because of the
                 flexibility they provide. The main performance penalty
                 in interpreters arises from instruction dispatch. Each
                 bytecode requires a minimum number of machine
                 instructions to be executed. In this work, we introduce
                 a novel approach for interpreter optimization that
                 reduces instruction dispatch thanks to vectorization
                 technology. We extend the split compilation paradigm to
                 interpreters, thus guaranteeing that our approach
                 exhibits almost no overhead at runtime. We take
                 advantage of the vast research in vectorization and its
                 presence in modern compilers. Complex analyses are
                 performed ahead of time, and their results are conveyed
                 to the executable bytecode. At runtime, the interpreter
                 retrieves this additional information to build the SIMD
                 IR (intermediate representation) instructions that
                 carry the vector semantics. The bytecode language
                 remains unmodified, making this representation
                 compatible with legacy interpreters and previously
                 proposed JIT compilers. We show that this approach
                 drastically reduces the number of instructions to
                 interpret and decreases execution time of vectorizable
                 applications. Moreover, we map SIMD IR instructions to
                 hardware SIMD instructions when available, with a
                 substantial additional improvement. Finally, we finely
                 analyze the impact of our extension on the behavior of
                 the caches and branch predictors.",
  acknowledgement = ack-nhfb,
  articleno =    "26",
  fjournal =     "ACM Transactions on Architecture and Code Optimization
                 (TACO)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J924",
}

@Article{Cleary:2013:FAT,
  author =       "Jimmy Cleary and Owen Callanan and Mark Purcell and
                 David Gregg",
  title =        "Fast asymmetric thread synchronization",
  journal =      j-TACO,
  volume =       "9",
  number =       "4",
  pages =        "27:1--27:??",
  month =        jan,
  year =         "2013",
  CODEN =        "????",
  DOI =          "http://dx.doi.org/10.1145/2400682.2400686",
  ISSN =         "1544-3566 (print), 1544-3973 (electronic)",
  ISSN-L =       "1544-3566",
  bibdate =      "Fri Jan 18 10:57:16 MST 2013",
  bibsource =    "http://portal.acm.org/;
                 http://www.math.utah.edu/pub/tex/bib/taco.bib",
  abstract =     "For most multi-threaded applications, data structures
                 must be shared between threads. Ensuring thread safety
                 on these data structures incurs overhead in the form of
                 locking and other synchronization mechanisms. Where
                 data is shared among multiple threads these costs are
                 unavoidable. However, a common access pattern is that
                 data is accessed primarily by one dominant thread, and
                 only very rarely by the other, non-dominant threads.
                 Previous research has proposed biased locks, which are
                 optimized for a single dominant thread, at the cost of
                 greater overheads for non-dominant threads. In this
                 article we propose a new family of biased
                 synchronization mechanisms that, using a modified
                 interface, push accesses to shared data from the
                 non-dominant threads to the dominant one, via a novel
                 set of message passing mechanisms. We present
                 mechanisms for protecting critical sections, for
                 queueing work, for caching shared data in registers
                 where it is safe to do so, and for asynchronous
                 critical section accesses. We present results for the
                 conventional Intel\reg{} Sandy Bridge processor and for
                 the emerging network-optimized many-core IBM\reg{}
                 PowerEN{\TM} processor. We find that our algorithms
                 compete well with existing biased locking algorithms,
                 and, in particular, perform better than existing
                 algorithms as accesses from non-dominant threads
                 increase.",
  acknowledgement = ack-nhfb,
  articleno =    "27",
  fjournal =     "ACM Transactions on Architecture and Code Optimization
                 (TACO)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J924",
}

@Article{Li:2013:PTL,
  author =       "Yong Li and Rami Melhem and Alex K. Jones",
  title =        "{PS-TLB}: Leveraging page classification information
                 for fast, scalable and efficient translation for future
                 {CMPs}",
  journal =      j-TACO,
  volume =       "9",
  number =       "4",
  pages =        "28:1--28:??",
  month =        jan,
  year =         "2013",
  CODEN =        "????",
  DOI =          "http://dx.doi.org/10.1145/2400682.2400687",
  ISSN =         "1544-3566 (print), 1544-3973 (electronic)",
  ISSN-L =       "1544-3566",
  bibdate =      "Fri Jan 18 10:57:16 MST 2013",
  bibsource =    "http://portal.acm.org/;
                 http://www.math.utah.edu/pub/tex/bib/taco.bib",
  abstract =     "Traversing the page table during virtual to physical
                 address translation causes pipeline stalls when misses
                 occur in the translation-lookaside buffer (TLB).
                 State-of-the-art translation proposals typically
                 optimize a single aspect of translation performance
                 (e.g., translation sharing, context switch performance,
                 etc.) with potential trade-offs of additional hardware
                 complexity, increased translation latency, or reduced
                 scalability. In this article, we propose the partial
                 sharing TLB (PS-TLB), a fast and scalable solution that
                 reduces off-chip translation misses without sacrificing
                 the timing-critical requirement of on-chip translation.
                 We introduce the partial sharing buffer (PSB) which
                 leverages application page sharing characteristics
                 using minimal additional hardware resources. Compared
                 to the leading TLB proposal that leverages sharing,
                 PS-TLB provides a more than 45\% improvement in
                 translation latency with a 9\% application speedup
                 while using fewer storage resources. In addition, the
                 page classification and PS-TLB architecture provide
                 further optimizations including an over 30\% reduction
                 of interprocessor interrupts for coherence, and reduced
                 context switch misses with fewer resources compared
                 with existing methods.",
  acknowledgement = ack-nhfb,
  articleno =    "28",
  fjournal =     "ACM Transactions on Architecture and Code Optimization
                 (TACO)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J924",
}

@Article{DuBois:2013:PTC,
  author =       "Kristof {Du Bois} and Stijn Eyerman and Lieven
                 Eeckhout",
  title =        "Per-thread cycle accounting in multicore processors",
  journal =      j-TACO,
  volume =       "9",
  number =       "4",
  pages =        "29:1--29:??",
  month =        jan,
  year =         "2013",
  CODEN =        "????",
  DOI =          "http://dx.doi.org/10.1145/2400682.2400688",
  ISSN =         "1544-3566 (print), 1544-3973 (electronic)",
  ISSN-L =       "1544-3566",
  bibdate =      "Fri Jan 18 10:57:16 MST 2013",
  bibsource =    "http://portal.acm.org/;
                 http://www.math.utah.edu/pub/tex/bib/taco.bib",
  abstract =     "While multicore processors improve overall chip
                 throughput and hardware utilization, resource sharing
                 among the cores leads to unpredictable performance for
                 the individual threads running on a multicore
                 processor. Unpredictable per-thread performance becomes
                 a problem when considered in the context of multicore
                 scheduling: system software assumes that all threads
                 make equal progress, however, this is not what the
                 hardware provides. This may lead to problems at the
                 system level such as missed deadlines, reduced
                 quality-of-service, non-satisfied service-level
                 agreements, unbalanced parallel performance, priority
                 inversion, unpredictable interactive performance, etc.
                 This article proposes a hardware-efficient per-thread
                 cycle accounting architecture for multicore processors.
                 The counter architecture tracks per-thread progress in
                 a multicore processor, detects how inter-thread
                 interference affects per-thread performance, and
                 predicts the execution time for each thread if run in
                 isolation. The counter architecture captures the
                 effects of additional conflict misses due to cache
                 sharing as well as increased latency for other memory
                 accesses due to resource and bandwidth contention in
                 the memory subsystem. The proposed method accounts for
                 74.3\% of the interference cycles, and estimates
                 per-thread progress within 14.2\% on average across a
                 large set of multi-program workloads. Hardware cost is
                 limited to 7.44KB for an 8-core processor, a reduction
                 by almost $10 \times$ compared to prior work while
                 being 63.8\% more accurate. Making system software
                 progress aware improves fairness by 22.5\% on average
                 over progress-agnostic scheduling.",
  acknowledgement = ack-nhfb,
  articleno =    "29",
  fjournal =     "ACM Transactions on Architecture and Code Optimization
                 (TACO)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J924",
}

@Article{Wimmer:2013:MAV,
  author =       "Christian Wimmer and Michael Haupt and Michael L. {Van
                 De Vanter} and Mick Jordan and Laurent Dayn{\`e}s and
                 Douglas Simon",
  title =        "{Maxine}: an approachable virtual machine for, and in,
                 {Java}",
  journal =      j-TACO,
  volume =       "9",
  number =       "4",
  pages =        "30:1--30:??",
  month =        jan,
  year =         "2013",
  CODEN =        "????",
  DOI =          "http://dx.doi.org/10.1145/2400682.2400689",
  ISSN =         "1544-3566 (print), 1544-3973 (electronic)",
  ISSN-L =       "1544-3566",
  bibdate =      "Fri Jan 18 10:57:16 MST 2013",
  bibsource =    "http://portal.acm.org/;
                 http://www.math.utah.edu/pub/tex/bib/taco.bib",
  abstract =     "A highly productive platform accelerates the
                 production of research results. The design of a Virtual
                 Machine (VM) written in the Java{\TM} programming
                 language can be simplified through exploitation of
                 interfaces, type and memory safety, automated memory
                 management (garbage collection), exception handling,
                 and reflection. Moreover, modern Java IDEs offer
                 time-saving features such as refactoring,
                 auto-completion, and code navigation. Finally, Java
                 annotations enable compiler extensions for low-level
                 ``systems programming'' while retaining IDE
                 compatibility. These techniques collectively make
                 complex system software more ``approachable'' than has
                 been typical in the past. The Maxine VM, a metacircular
                 Java VM implementation, has aggressively used these
                 features since its inception. A co-designed companion
                 tool, the Maxine Inspector, offers integrated debugging
                 and visualization of all aspects of the VM's runtime
                 state. The Inspector's implementation exploits advanced
                 Java language features, embodies intimate knowledge of
                 the VM's design, and even reuses a significant amount
                 of VM code directly. These characteristics make Maxine
                 a highly approachable VM research platform and a
                 productive basis for research and teaching.",
  acknowledgement = ack-nhfb,
  articleno =    "30",
  fjournal =     "ACM Transactions on Architecture and Code Optimization
                 (TACO)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J924",
}

@Article{Khan:2013:SBA,
  author =       "Malik Khan and Protonu Basu and Gabe Rudy and Mary
                 Hall and Chun Chen and Jacqueline Chame",
  title =        "A script-based autotuning compiler system to generate
                 high-performance {CUDA} code",
  journal =      j-TACO,
  volume =       "9",
  number =       "4",
  pages =        "31:1--31:??",
  month =        jan,
  year =         "2013",
  CODEN =        "????",
  DOI =          "http://dx.doi.org/10.1145/2400682.2400690",
  ISSN =         "1544-3566 (print), 1544-3973 (electronic)",
  ISSN-L =       "1544-3566",
  bibdate =      "Fri Jan 18 10:57:16 MST 2013",
  bibsource =    "http://portal.acm.org/;
                 http://www.math.utah.edu/pub/tex/bib/taco.bib",
  abstract =     "This article presents a novel compiler framework for
                 CUDA code generation. The compiler structure is
                 designed to support autotuning, which employs empirical
                 techniques to evaluate a set of alternative mappings of
                 computation kernels and select the mapping that obtains
                 the best performance. This article introduces a
                 Transformation Strategy Generator, a meta-optimizer
                 that generates a set of transformation recipes, which
                 are descriptions of the mapping of the sequential code
                 to parallel CUDA code. These recipes comprise a search
                 space of possible implementations. This system achieves
                 performance comparable and sometimes better than
                 manually tuned libraries and exceeds the performance of
                 a state-of-the-art GPU compiler.",
  acknowledgement = ack-nhfb,
  articleno =    "31",
  fjournal =     "ACM Transactions on Architecture and Code Optimization
                 (TACO)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J924",
}

@article{VanCraeynest:2013:UFD,
    author = {Kenzo {Van Craeynest} and Lieven Eeckhout},
    title = {Understanding fundamental design choices in
        single-{ISA} heterogeneous multicore architectures},
    journal = j-TACO,
    volume = {9},
    number = {4},
    pages = {32:1--32:??},
    month = jan,
    year = {2013},
    coden = {????},
    doi = {http://dx.doi.org/10.1145/2400682.2400691},
    issn = {1544-3566 (print), 1544-3973 (electronic)},
    issn-l = {1544-3566},
    bibdate = {Fri Jan 18 10:57:16 MST 2013},
    bibsource = {http://portal.acm.org/;
        http://www.math.utah.edu/pub/tex/bib/taco.bib},
    abstract = {Single-ISA heterogeneous multicore processors have
        gained substantial interest over the past few years
        because of their power efficiency, as they offer the
        potential for high overall chip throughput within a
        given power budget. Prior work in heterogeneous
        architectures has mainly focused on how heterogeneity
        can improve overall system throughput. To what extent
        heterogeneity affects per-program performance has
        remained largely unanswered. In this article, we aim at
        understanding how heterogeneity affects both chip
        throughput and per-program performance; how
        heterogeneous architectures compare to homogeneous
        architectures under both performance metrics; and how
        fundamental design choices, such as core type, cache
        size, and off-chip bandwidth, affect performance. We
        use analytical modeling to explore a large space of
        single-ISA heterogeneous architectures. The analytical
        model has linear-time complexity in the number of core
        types and programs of interest, and offers a unique
        opportunity for exploring the large space of both
        homogeneous and heterogeneous multicore processors in
        limited time. Our analysis provides several interesting
        insights: While it is true that heterogeneity can
        improve system throughput, it fundamentally trades
        per-program performance for chip throughput; although
        some heterogeneous configurations yield better
        throughput and per-program performance than homogeneous
        designs, some homogeneous configurations are optimal
        for particular throughput versus per-program
        performance trade-offs. Two core types provide most of
        the benefits from heterogeneity and a larger number of
        core types does not contribute much; job-to-core
        mapping is both important and challenging for
        heterogeneous multicore processors to achieve optimum
        performance. Limited off-chip bandwidth does alter some
        of the fundamental design choices in heterogeneous
        multicore architectures, such as the need for large
        on-chip caches for achieving high throughput, and
        per-program performance degrading more relative to
        throughput under constrained off-chip bandwidth.},
    acknowledgement = ack-nhfb,
    articleno = {32},
    fjournal = {ACM Transactions on Architecture and Code Optimization
        (TACO)},
    journal-url = {http://portal.acm.org/browse_dl.cfm?idx=J924},
}

@Article{Antao:2013:CFA,
  author =       "Samuel Ant{\~a}o and Leonel Sousa",
  title =        "The {CRNS} framework and its application to
                 programmable and reconfigurable cryptography",
  journal =      j-TACO,
  volume =       "9",
  number =       "4",
  pages =        "33:1--33:??",
  month =        jan,
  year =         "2013",
  CODEN =        "????",
  DOI =          "http://dx.doi.org/10.1145/2400682.2400692",
  ISSN =         "1544-3566 (print), 1544-3973 (electronic)",
  ISSN-L =       "1544-3566",
  bibdate =      "Fri Jan 18 10:57:16 MST 2013",
  bibsource =    "http://portal.acm.org/;
                 http://www.math.utah.edu/pub/tex/bib/taco.bib",
  abstract =     "This article proposes the Computing with the
                 Residue Number System (CRNS) framework, which aims at
                 the design automation of accelerators for Modular
                 Arithmetic (MA). The framework provides a comprehensive
                 set of tools ranging from a programming language and
                 respective compiler to back-ends targeting parallel
                 computation platforms such as Graphical Processing
                 Units (GPUs) and reconfigurable hardware. Given an
                 input algorithm described with a high-level programming
                 language, the CRNS can be used to obtain in a few
                 seconds the corresponding optimized Parallel Thread
                 Execution (PTX) program ready to be run on GPUs or the
                 Hardware Description Language (HDL) specification of a
                 fully functional accelerator suitable for
                 reconfigurable hardware and embedded systems. The
                 resulting framework's implementations benefit from the
                 Residue Number System (RNS) arithmetic's
                 parallelization properties in a fully automated way.
                 Designers do not need to be familiar with the
                 mathematical details concerning the employed
                 arithmetic, namely the RNS representation. In order to
                 thoroughly describe and evaluate the proposed
                 framework, experimental results obtained for the
                 supported back-ends (GPU and HDL) are presented
                 targeting the implementation of the modular
                 exponentiation used in the Rivest-Shamir-Adleman (RSA)
                 algorithm and Elliptic Curve (EC) point multiplication.
                 Results suggest competitive latency and throughput with
                 minimum design effort and overcoming all the
                 development issues that arise in the specification and
                 verification of dedicated solutions.",
  acknowledgement = ack-nhfb,
  articleno =    "33",
  fjournal =     "ACM Transactions on Architecture and Code Optimization
                 (TACO)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J924",
}

@Article{Diouf:2013:DLM,
  author =       "Boubacar Diouf and Can Hantas and Albert Cohen and
                 {\"O}zcan {\"O}zturk and Jens Palsberg",
  title =        "A decoupled local memory allocator",
  journal =      j-TACO,
  volume =       "9",
  number =       "4",
  pages =        "34:1--34:??",
  month =        jan,
  year =         "2013",
  CODEN =        "????",
  DOI =          "http://dx.doi.org/10.1145/2400682.2400693",
  ISSN =         "1544-3566 (print), 1544-3973 (electronic)",
  ISSN-L =       "1544-3566",
  bibdate =      "Fri Jan 18 10:57:16 MST 2013",
  bibsource =    "http://portal.acm.org/;
                 http://www.math.utah.edu/pub/tex/bib/taco.bib",
  abstract =     "Compilers use software-controlled local memories to
                 provide fast, predictable, and power-efficient access
                 to critical data. We show that the local memory
                 allocation for straight-line, or linearized programs is
                 equivalent to a weighted interval-graph coloring
                 problem. This problem is new when allowing a color
                 interval to ``wrap around,'' and we call it the
                 submarine-building problem. This graph-theoretical
                 decision problem differs slightly from the classical
                 ship-building problem, and exhibits very interesting
                 and unusual complexity properties. We demonstrate that
                 the submarine-building problem is NP-complete, while it
                 is solvable in linear time for not-so-proper interval
                 graphs, an extension of the class of proper
                 interval graphs. We propose a clustering heuristic to
                 approximate any interval graph into a not-so-proper
                 interval graph, decoupling spill code generation from
                 local memory assignment. We apply this heuristic to a
                 large number of randomly generated interval graphs
                 reproducing the statistical features of standard local
                 memory allocation benchmarks, comparing with
                 state-of-the-art heuristics.",
  acknowledgement = ack-nhfb,
  articleno =    "34",
  fjournal =     "ACM Transactions on Architecture and Code Optimization
                 (TACO)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J924",
}

@article{Cui:2013:LOC,
    author = {Huimin Cui and Qing Yi and Jingling Xue and Xiaobing
        Feng},
    title = {Layout-oblivious compiler optimization for matrix
        computations},
    journal = j-TACO,
    volume = {9},
    number = {4},
    pages = {35:1--35:??},
    month = jan,
    year = {2013},
    coden = {????},
    doi = {http://dx.doi.org/10.1145/2400682.2400694},
    issn = {1544-3566 (print), 1544-3973 (electronic)},
    issn-l = {1544-3566},
    bibdate = {Fri Jan 18 10:57:16 MST 2013},
    bibsource = {http://portal.acm.org/;
        http://www.math.utah.edu/pub/tex/bib/taco.bib},
    abstract = {Most scientific computations serve to apply
        mathematical operations to a set of preconceived data
        structures, e.g., matrices, vectors, and grids. In this
        article, we use a number of widely used matrix
        computations from the LINPACK library to demonstrate
        that complex internal organizations of data structures
        can severely degrade the effectiveness of compiler
        optimizations. We then present a data-layout-oblivious
        optimization methodology, where by isolating an
        abstract representation of the computations from
        complex implementation details of their data, we enable
        these computations to be much more accurately analyzed
        and optimized through varying state-of-the-art compiler
        technologies. We evaluated our approach on an Intel
        8-core platform using two source-to-source compiler
        infrastructures, Pluto and EPOD. Our results show that
        while the efficiency of a computational kernel differs
        when using different data layouts, the alternative
        implementations typically benefit from a common set of
        optimizations on the operations. Therefore separately
        optimizing the operations and the data layout of a
        computation could dramatically enhance the
        effectiveness of compiler optimizations compared with
        the conventional approaches of using a unified
        representation.},
    acknowledgement = ack-nhfb,
    articleno = {35},
    fjournal = {ACM Transactions on Architecture and Code Optimization
        (TACO)},
    journal-url = {http://portal.acm.org/browse_dl.cfm?idx=J924},
}

@Article{Dolan:2013:CSL,
  author =       "Stephen Dolan and Servesh Muralidharan and David
                 Gregg",
  title =        "Compiler support for lightweight context switching",
  journal =      j-TACO,
  volume =       "9",
  number =       "4",
  pages =        "36:1--36:??",
  month =        jan,
  year =         "2013",
  CODEN =        "????",
  DOI =          "http://dx.doi.org/10.1145/2400682.2400695",
  ISSN =         "1544-3566 (print), 1544-3973 (electronic)",
  ISSN-L =       "1544-3566",
  bibdate =      "Fri Jan 18 10:57:16 MST 2013",
  bibsource =    "http://portal.acm.org/;
                 http://www.math.utah.edu/pub/tex/bib/taco.bib",
  abstract =     "We propose a new language-neutral primitive for the
                 LLVM compiler, which provides efficient context
                 switching and message passing between lightweight
                 threads of control. The primitive, called Swapstack,
                 can be used by any language implementation based on
                 LLVM to build higher-level language structures such as
                 continuations, coroutines, and lightweight threads. As
                 part of adding the primitives to LLVM, we have also
                 added compiler support for passing parameters across
                 context switches. Our modified LLVM compiler produces
                 highly efficient code through a combination of exposing
                 the context switching code to existing compiler
                 optimizations, and adding novel compiler optimizations
                 to further reduce the cost of context switches. To
                 demonstrate the generality and efficiency of our
                 primitives, we add one-shot continuations to C++, and
                 provide a simple fiber library that allows millions of
                 fibers to run on multiple cores, with a work-stealing
                 scheduler and fast inter-fiber synchronization. We
                 argue that compiler-supported lightweight context
                 switching can be significantly faster than using a
                 library to switch between contexts, and provide
                 experimental evidence to support the position.",
  acknowledgement = ack-nhfb,
  articleno =    "36",
  fjournal =     "ACM Transactions on Architecture and Code Optimization
                 (TACO)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J924",
}

@article{Abad:2013:LLE,
    author = {Pablo Abad and Valentin Puente and Jose-Angel
        Gregorio},
    title = {{LIGERO}: a light but efficient router conceived for
        cache-coherent chip multiprocessors},
    journal = j-TACO,
    volume = {9},
    number = {4},
    pages = {37:1--37:??},
    month = jan,
    year = {2013},
    coden = {????},
    doi = {http://dx.doi.org/10.1145/2400682.2400696},
    issn = {1544-3566 (print), 1544-3973 (electronic)},
    issn-l = {1544-3566},
    bibdate = {Fri Jan 18 10:57:16 MST 2013},
    bibsource = {http://portal.acm.org/;
        http://www.math.utah.edu/pub/tex/bib/taco.bib},
    abstract = {Although abstraction is the best approach to deal with
        computing system complexity, sometimes implementation
        details should be considered. Considering on-chip
        interconnection networks in particular, underestimating
        the underlying system specificity could have
        nonnegligible impact on performance, cost, or
        correctness. This article presents a very efficient
        router that has been devised to deal with
        cache-coherent chip multiprocessor particularities in a
        balanced way. Employing the same principles of packet
        rotation structures as in the rotary router, we present
        a router configuration with the following novel
        features: (1) reduced buffering requirements, (2)
        optimized pipeline under contentionless conditions, (3)
        more efficient deadlock avoidance mechanism, and (4)
        optimized in-order delivery guarantee. Putting it all
        together, our proposal provides a set of features that
        no other router, to the best of our knowledge, has
        achieved previously. These are: (1') low implementation
        cost, (2') low pass-through latency under low load,
        (3') improved resource utilization through adaptive
        routing and a buffering scheme free of head-of-line
        blocking, (4') guarantee of coherence protocol
        correctness via end-to-end deadlock avoidance and
        in-order delivery, and (5') improvement of coherence
        protocol responsiveness through adaptive in-network
        multicast support. We conduct a thorough evaluation
        that includes hardware cost estimation and performance
        evaluation under a wide spectrum of realistic workloads
        and coherence protocols. Comparing our proposal with
        VCTM, an optimized state-of-the-art wormhole router, it
        requires 50\% less area, reduces on-chip cache
        hierarchy energy delay product on average by 20\%, and
        improves the cache-coherency chip multiprocessor
        performance under realistic working conditions by up to
        20\%.},
    acknowledgement = ack-nhfb,
    articleno = {37},
    fjournal = {ACM Transactions on Architecture and Code Optimization
        (TACO)},
    journal-url = {http://portal.acm.org/browse_dl.cfm?idx=J924},
}

@article{Albericio:2013:ERL,
    author = {Jorge Albericio and Pablo Ib{\'a}{\~n}ez and
        V{\'\i}ctor Vi{\~n}als and Jose Mar{\'\i}a
        Llaber{\'\i}a},
    title = {Exploiting reuse locality on inclusive shared
        last-level caches},
    journal = j-TACO,
    volume = {9},
    number = {4},
    pages = {38:1--38:??},
    month = jan,
    year = {2013},
    coden = {????},
    doi = {http://dx.doi.org/10.1145/2400682.2400697},
    issn = {1544-3566 (print), 1544-3973 (electronic)},
    issn-l = {1544-3566},
    bibdate = {Fri Jan 18 10:57:16 MST 2013},
    bibsource = {http://portal.acm.org/;
        http://www.math.utah.edu/pub/tex/bib/taco.bib},
    abstract = {Optimization of the replacement policy used for Shared
        Last-Level Cache (SLLC) management in a
        Chip-MultiProcessor (CMP) is critical for avoiding
        off-chip accesses. Temporal locality, while being
        exploited by first levels of private cache memories, is
        only slightly exhibited by the stream of references
        arriving at the SLLC. Thus, traditional replacement
        algorithms based on recency are bad choices for
        governing SLLC replacement. Recent proposals involve
        SLLC replacement policies that attempt to exploit reuse
        either by segmenting the replacement list or improving
        the rereference interval prediction. On the other hand,
        inclusive SLLCs are commonplace in the CMP market, but
        the interaction between replacement policy and the
        enforcement of inclusion has barely been discussed.
        After analyzing that interaction, this article
        introduces two simple replacement policies exploiting
        reuse locality and targeting inclusive SLLCs: Least
        Recently Reused (LRR) and Not Recently Reused (NRR).
        NRR has the same implementation cost as NRU, and LRR
        only adds one bit per line to the LRU cost. After
        considering reuse locality and its interaction with the
        invalidations induced by inclusion, the proposals are
        evaluated by simulating multiprogrammed workloads in an
        8-core system with two private cache levels and an
        SLLC. LRR outperforms LRU by 4.5\% (performing better
        in 97 out of 100 mixes) and NRR outperforms NRU by
        4.2\% (performing better in 99 out of 100 mixes). We
        also show that our mechanisms outperform rereference
        interval prediction, a recently proposed SLLC
        replacement policy and that similar conclusions can be
        drawn by varying the associativity or the SLLC size.},
    acknowledgement = ack-nhfb,
    articleno = {38},
    fjournal = {ACM Transactions on Architecture and Code Optimization
        (TACO)},
    journal-url = {http://portal.acm.org/browse_dl.cfm?idx=J924},
}

@Article{Yiapanis:2013:OSR,
  author =       "Paraskevas Yiapanis and Demian Rosas-Ham and Gavin
                 Brown and Mikel Luj{\'a}n",
  title =        "Optimizing software runtime systems for speculative
                 parallelization",
  journal =      j-TACO,
  volume =       "9",
  number =       "4",
  pages =        "39:1--39:??",
  month =        jan,
  year =         "2013",
  CODEN =        "????",
  DOI =          "http://dx.doi.org/10.1145/2400682.2400698",
  ISSN =         "1544-3566 (print), 1544-3973 (electronic)",
  ISSN-L =       "1544-3566",
  bibdate =      "Fri Jan 18 10:57:16 MST 2013",
  bibsource =    "http://portal.acm.org/;
                 http://www.math.utah.edu/pub/tex/bib/taco.bib",
  abstract =     "Thread-Level Speculation (TLS) overcomes limitations
                 intrinsic with conservative compile-time
                 auto-parallelizing tools by extracting parallel threads
                 optimistically and only ensuring absence of data
                 dependence violations at runtime. A significant barrier
                 for adopting TLS (implemented in software) is the
                 overheads associated with maintaining speculative
                 state. Based on previous TLS limit studies, we observe
                 that on future multicore systems we will likely have
                 more cores idle than those which traditional TLS would
                 be able to harness. This implies that a TLS system
                 should focus on optimizing for small number of cores
                 and find efficient ways to take advantage of the idle
                 cores. Furthermore, research on optimistic systems has
                 covered two important implementation design points:
                 eager vs. lazy version management. With this knowledge,
                 we propose new simple and effective techniques to
                 reduce the execution time overheads for both of these
                 design points. This article describes a novel compact
                 version management data structure optimized for space
                 overhead when using a small number of TLS threads.
                 Furthermore, we describe two novel software runtime
                 parallelization systems that utilize this compact data
                 structure. The first software TLS system, MiniTLS,
                 relies on eager memory data management (in-place
                 updates) and, thus, when a misspeculation occurs a
                 rollback process is required. MiniTLS takes advantage
                 of the novel compact version management representation
                 to parallelize the rollback process and is able to
                 recover from misspeculation faster than existing
                 software eager TLS systems. The second one, Lector
                 (Lazy inspECTOR) is based on lazy version management.
                 Since we have idle cores, the question is whether we
                 can create ``helper'' tasks to determine whether
                 speculation is actually needed without stopping or
                 damaging the speculative execution. In Lector, for each
                 conventional TLS thread running speculatively with lazy
                 version management, there is associated with it a
                 lightweight inspector. The inspector threads execute
                 alongside to verify quickly whether data dependencies
                 will occur. Inspector threads are generated by standard
                 techniques for inspector/executor parallelization. We
                 have applied both TLS systems to seven Java sequential
                 benchmarks, including three benchmarks from
                 SPECjvm2008. Two out of the seven benchmarks exhibit
                 misspeculations. MiniTLS experiments report average
                 speedups of 1.8x for 4 threads increasing close to 7x
                 speedups with 32 threads. Facilitated by our novel
                 compact representation, MiniTLS reduces the space
                 overhead over state-of-the-art software TLS systems
                 between 96\% on 2 threads and 40\% on 32 threads. The
                 experiments for Lector, report average speedups of 1.7x
                 for 2 threads (that is 1 TLS + 1 Inspector threads)
                 increasing close to 8.2x speedups with 32 threads (16 +
                 16 threads). Compared to a well established software
                 TLS baseline, Lector performs on average 1.7x faster
                 for 32 threads and in no case (x TLS + x Inspector
                 threads) Lector delivers worse performance than the
                 baseline TLS with the equivalent number of TLS threads
                 (i.e., x TLS threads) nor doubling the equivalent
                 number of TLS threads (i.e., x + x TLS threads).",
  acknowledgement = ack-nhfb,
  articleno =    "39",
  fjournal =     "ACM Transactions on Architecture and Code Optimization
                 (TACO)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J924",
}

@article{Nugteren:2013:ASC,
    author = {Cedric Nugteren and Pieter Custers and Henk
        Corporaal},
    title = {Algorithmic species: a classification of affine loop
        nests for parallel programming},
    journal = j-TACO,
    volume = {9},
    number = {4},
    pages = {40:1--40:??},
    month = jan,
    year = {2013},
    coden = {????},
    doi = {http://dx.doi.org/10.1145/2400682.2400699},
    issn = {1544-3566 (print), 1544-3973 (electronic)},
    issn-l = {1544-3566},
    bibdate = {Fri Jan 18 10:57:16 MST 2013},
    bibsource = {http://portal.acm.org/;
        http://www.math.utah.edu/pub/tex/bib/taco.bib},
    abstract = {Code generation and programming have become ever more
        challenging over the last decade due to the shift
        towards parallel processing. Emerging processor
        architectures such as multi-cores and GPUs exploit
        increasingly parallelism, requiring programmers and
        compilers to deal with aspects such as threading,
        concurrency, synchronization, and complex memory
        partitioning. We advocate that programmers and
        compilers can greatly benefit from a structured
        classification of program code. Such a classification
        can help programmers to find opportunities for
        parallelization, reason about their code, and interact
        with other programmers. Similarly, parallelising
        compilers and source-to-source compilers can take
        threading and optimization decisions based on the same
        classification. In this work, we introduce algorithmic
        species, a classification of affine loop nests based on
        the polyhedral model and targeted for both automatic
        and manual use. Individual classes capture information
        such as the structure of parallelism and the data
        reuse. To make the classification applicable for manual
        use, a basic vocabulary forms the base for the creation
        of a set of intuitive classes. To demonstrate the use
        of algorithmic species, we identify 115 classes in a
        benchmark set. Additionally, we demonstrate the
        suitability of algorithmic species for automated uses
        by showing a tool to automatically extract species from
        program code, a species-based source-to-source
        compiler, and a species-based performance prediction
        model.},
    acknowledgement = ack-nhfb,
    articleno = {40},
    fjournal = {ACM Transactions on Architecture and Code Optimization
        (TACO)},
    journal-url = {http://portal.acm.org/browse_dl.cfm?idx=J924},
}

@Article{Gerards:2013:ODD,
  author =       "Marco E. T. Gerards and Jan Kuper",
  title =        "Optimal {DPM} and {DVFS} for frame-based real-time
                 systems",
  journal =      j-TACO,
  volume =       "9",
  number =       "4",
  pages =        "41:1--41:??",
  month =        jan,
  year =         "2013",
  CODEN =        "????",
  DOI =          "http://dx.doi.org/10.1145/2400682.2400700",
  ISSN =         "1544-3566 (print), 1544-3973 (electronic)",
  ISSN-L =       "1544-3566",
  bibdate =      "Fri Jan 18 10:57:16 MST 2013",
  bibsource =    "http://portal.acm.org/;
                 http://www.math.utah.edu/pub/tex/bib/taco.bib",
  abstract =     "Dynamic Power Management (DPM) and Dynamic Voltage and
                 Frequency Scaling (DVFS) are popular techniques for
                 reducing energy consumption. Algorithms for optimal
                 DVFS exist, but optimal DPM and the optimal combination
                 of DVFS and DPM are not yet solved. In this article we
                 use well-established models of DPM and DVFS for
                 frame-based systems. We show that it is not
                 sufficient---as some authors argue---to consider only
                 individual invocations of a task. We define a schedule
                 that also takes interactions between invocations into
                 account and prove---in a theoretical fashion---that
                 this schedule is optimal.",
  acknowledgement = ack-nhfb,
  articleno =    "41",
  fjournal =     "ACM Transactions on Architecture and Code Optimization
                 (TACO)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J924",
}

@Article{Yan:2013:IPA,
  author          = {Zhichao Yan and Hong Jiang and Yujuan Tan and Dan
                     Feng},
  title           = {An integrated pseudo-associativity and relaxed-order
                     approach to hardware transactional memory},
  journal         = j-TACO,
  fjournal        = {ACM Transactions on Architecture and Code Optimization
                     (TACO)},
  volume          = {9},
  number          = {4},
  pages           = {42:1--42:??},
  articleno       = {42},
  month           = jan,
  year            = {2013},
  CODEN           = {????},
  DOI             = {http://dx.doi.org/10.1145/2400682.2400701},
  ISSN            = {1544-3566 (print), 1544-3973 (electronic)},
  ISSN-L          = {1544-3566},
  journal-URL     = {http://portal.acm.org/browse_dl.cfm?idx=J924},
  bibdate         = {Fri Jan 18 10:57:16 MST 2013},
  bibsource       = {http://portal.acm.org/;
                     http://www.math.utah.edu/pub/tex/bib/taco.bib},
  acknowledgement = ack-nhfb,
  abstract        = {Our experimental study and analysis reveal that the
                     bottlenecks of existing hardware transactional memory
                     systems are largely rooted in the extra data movements
                     in version management and in the inefficient scheduling
                     of conflicting transactions in conflict management,
                     particularly in the presence of high-contention and
                     coarse-grained applications. In order to address this
                     problem, we propose an integrated Pseudo-Associativity
                     and Relaxed-Order approach to hardware Transactional
                     Memory, called PARO-TM. It exploits the extra
                     pseudo-associative space in the data cache to hold the
                     new value of each transactional modification, and
                     maintains the mappings between the old and new versions
                     via an implicit pseudo-associative hash algorithm
                     (i.e., by inverting the specific bit of the SET index).
                     PARO-TM can branch out the speculative version from the
                     old version upon each transactional modification on
                     demand without a dedicated hardware component to hold
                     the uncommitted data. This means that it is able to
                     automatically access the proper version upon the
                     transaction's commit or abort. Moreover, PARO-TM
                     augments multi-version support in a chained directory
                     to schedule conflicting transactions in a relaxed-order
                     manner to further reduce their overheads. We compare
                     PARO-TM with the state-of-the-art LogTM-SE, TCC, DynTM,
                     and SUV-TM systems and find that PARO-TM consistently
                     outperforms these four representative HTMs. This
                     performance advantage of PARO-TM is far more pronounced
                     under the high-contention and coarse-grained
                     applications in the STAMP benchmark suite, for which
                     PARO-TM is motivated and designed.},
}

@Article{Chen:2013:PGF,
  author          = {Doris Chen and Deshanand Singh},
  title           = {Profile-guided floating- to fixed-point conversion for
                     hybrid {FPGA}-processor applications},
  journal         = j-TACO,
  fjournal        = {ACM Transactions on Architecture and Code Optimization
                     (TACO)},
  volume          = {9},
  number          = {4},
  pages           = {43:1--43:??},
  articleno       = {43},
  month           = jan,
  year            = {2013},
  CODEN           = {????},
  DOI             = {http://dx.doi.org/10.1145/2400682.2400702},
  ISSN            = {1544-3566 (print), 1544-3973 (electronic)},
  ISSN-L          = {1544-3566},
  journal-URL     = {http://portal.acm.org/browse_dl.cfm?idx=J924},
  bibdate         = {Fri Jan 18 10:57:16 MST 2013},
  bibsource       = {http://portal.acm.org/;
                     http://www.math.utah.edu/pub/tex/bib/taco.bib},
  acknowledgement = ack-nhfb,
  abstract        = {The key to enabling widespread use of FPGAs for
                     algorithm acceleration is to allow programmers to
                     create efficient designs without the time-consuming
                     hardware design process. Programmers are used to
                     developing scientific and mathematical algorithms in
                     high-level languages (C/C++) using floating point data
                     types. Although easy to implement, the dynamic range
                     provided by floating point is not necessary in many
                     applications; more efficient implementations can be
                     realized using fixed point arithmetic. While this topic
                     has been studied previously [Han et al. 2006; Olson et
                     al. 1999; Gaffar et al. 2004; Aamodt and Chow 1999],
                     the degree of full automation has always been lacking.
                     We present a novel design flow for cases where FPGAs
                     are used to offload computations from a microprocessor.
                     Our LLVM-based algorithm inserts value profiling code
                     into an unmodified C/C++ application to guide its
                     automatic conversion to fixed point. This allows for
                     fast and accurate design space exploration on a host
                     microprocessor before any accelerators are mapped to
                     the FPGA. Through experimental results, we demonstrate
                     that fixed-point conversion can yield resource savings
                     of up to 2x--3x reductions. Embedded RAM usage is
                     minimized, and 13\%--22\% higher $F_{\rm max}$ than the
                     original floating-point implementation is observed. In
                     a case study, we show that 17\% reduction in logic and
                     24\% reduction in register usage can be realized by
                     using our algorithm in conjunction with a High-Level
                     Synthesis (HLS) tool.},
}

@Article{Cui:2013:LCA,
  author          = {Yan Cui and Yingxin Wang and Yu Chen and Yuanchun
                     Shi},
  title           = {Lock-contention-aware scheduler: a scalable and
                     energy-efficient method for addressing scalability
                     collapse on multicore systems},
  journal         = j-TACO,
  fjournal        = {ACM Transactions on Architecture and Code Optimization
                     (TACO)},
  volume          = {9},
  number          = {4},
  pages           = {44:1--44:??},
  articleno       = {44},
  month           = jan,
  year            = {2013},
  CODEN           = {????},
  DOI             = {http://dx.doi.org/10.1145/2400682.2400703},
  ISSN            = {1544-3566 (print), 1544-3973 (electronic)},
  ISSN-L          = {1544-3566},
  journal-URL     = {http://portal.acm.org/browse_dl.cfm?idx=J924},
  bibdate         = {Fri Jan 18 10:57:16 MST 2013},
  bibsource       = {http://portal.acm.org/;
                     http://www.math.utah.edu/pub/tex/bib/taco.bib},
  acknowledgement = ack-nhfb,
  abstract        = {In response to the increasing ubiquity of multicore
                     processors, there has been widespread development of
                     multithreaded applications that strive to realize their
                     full potential. Unfortunately, lock contention within
                     operating systems can limit the scalability of
                     multicore systems so severely that an increase in the
                     number of cores can actually lead to reduced
                     performance (i.e., scalability collapse). Existing
                     efforts of solving scalability collapse mainly focus on
                     making critical sections of kernel code fine-grained or
                     designing new synchronization primitives. However,
                     these methods have disadvantages in scalability or
                     energy efficiency. In this article, we observe that the
                     percentage of lock-waiting time over the total
                     execution time for a lock intensive task has a
                     significant correlation with the occurrence of
                     scalability collapse. Based on this observation, a
                     lock-contention-aware scheduler is proposed.
                     Specifically, each task in the scheduler monitors its
                     percentage of lock waiting time continuously. If the
                     percentage exceeds a predefined threshold, this task is
                     considered as lock intensive and migrated to a Special
                     Set of Cores (i.e., SSC). In this way, the number of
                     concurrently running lock-intensive tasks is limited to
                     the number of cores in the SSC, and therefore, the
                     degree of lock contention is controlled. A central
                     challenge of using this scheme is how many cores should
                     be allocated in the SSC to handle lock-intensive tasks.
                     In our scheduler, the optimal number of cores is
                     determined online by the model-driven search. The
                     proposed scheduler is implemented in the recent Linux
                     kernel and evaluated using micro- and macrobenchmarks
                     on AMD and Intel 32-core systems. Experimental results
                     suggest that our proposal is able to remove scalability
                     collapse completely and sustains the maximal throughput
                     of the spin-lock-based system for most applications.
                     Furthermore, the percentage of lock-waiting time can be
                     reduced by up to 84\%. When compared with scalability
                     collapse reduction methods such as requester-based
                     locking scheme and sleeping-based synchronization
                     primitives, our scheme exhibits significant advantages
                     in scalability, power consumption, and energy
                     efficiency.},
}

@Article{Pusukuri:2013:AFC,
  author          = {Kishore Kumar Pusukuri and Rajiv Gupta and Laxmi N.
                     Bhuyan},
  title           = {{ADAPT}: a framework for coscheduling multithreaded
                     programs},
  journal         = j-TACO,
  fjournal        = {ACM Transactions on Architecture and Code Optimization
                     (TACO)},
  volume          = {9},
  number          = {4},
  pages           = {45:1--45:??},
  articleno       = {45},
  month           = jan,
  year            = {2013},
  CODEN           = {????},
  DOI             = {http://dx.doi.org/10.1145/2400682.2400704},
  ISSN            = {1544-3566 (print), 1544-3973 (electronic)},
  ISSN-L          = {1544-3566},
  journal-URL     = {http://portal.acm.org/browse_dl.cfm?idx=J924},
  bibdate         = {Fri Jan 18 10:57:16 MST 2013},
  bibsource       = {http://portal.acm.org/;
                     http://www.math.utah.edu/pub/tex/bib/taco.bib},
  acknowledgement = ack-nhfb,
  abstract        = {Since multicore systems offer greater performance via
                     parallelism, future computing is progressing towards
                     use of multicore machines with large number of cores.
                     However, the performance of emerging multithreaded
                     programs often does not scale to fully utilize the
                     available cores. Therefore, simultaneously running
                     multiple multithreaded applications becomes inevitable
                     to fully exploit the computing potential of such
                     machines. However, maximizing the performance and
                     throughput on multicore machines in the presence of
                     multiple multithreaded programs is a challenge for the
                     OS. We have observed that the state-of-the-art
                     contention management algorithms fail to effectively
                     coschedule multithreaded programs on multicore
                     machines. To address the above challenge, we present
                     ADAPT, a scheduling framework that continuously
                     monitors the resource usage of multithreaded programs
                     and adaptively coschedules them such that they
                     interfere with each other's performance as little as
                     possible. In addition, ADAPT selects appropriate memory
                     allocation and scheduling policies according to the
                     workload characteristics. We have implemented ADAPT on
                     a 64-core Supermicro server running Solaris 11 and
                     evaluated it using 26 multithreaded programs including
                     the TATP database application, SPECjbb2005, and
                     programs from Phoenix, PARSEC, and SPEC OMP suites. The
                     experimental results show that ADAPT substantially
                     improves total turnaround time and system utilization
                     relative to the default Solaris 11 scheduler.},
}

@Article{Tartara:2013:CLC,
  author          = {Michele Tartara and Stefano Crespi Reghizzi},
  title           = {Continuous learning of compiler heuristics},
  journal         = j-TACO,
  fjournal        = {ACM Transactions on Architecture and Code Optimization
                     (TACO)},
  volume          = {9},
  number          = {4},
  pages           = {46:1--46:??},
  articleno       = {46},
  month           = jan,
  year            = {2013},
  CODEN           = {????},
  DOI             = {http://dx.doi.org/10.1145/2400682.2400705},
  ISSN            = {1544-3566 (print), 1544-3973 (electronic)},
  ISSN-L          = {1544-3566},
  journal-URL     = {http://portal.acm.org/browse_dl.cfm?idx=J924},
  bibdate         = {Fri Jan 18 10:57:16 MST 2013},
  bibsource       = {http://portal.acm.org/;
                     http://www.math.utah.edu/pub/tex/bib/taco.bib},
  acknowledgement = ack-nhfb,
  abstract        = {Optimizing programs to exploit the underlying hardware
                     architecture is an important task. Much research has
                     been done on enabling compilers to find the best set of
                     code optimizations that can build the fastest and less
                     resource-hungry executable for a given program. A
                     common approach is iterative compilation, sometimes
                     enriched by machine learning techniques. This provides
                     good results, but requires extremely long compilation
                     times and an initial training phase lasting even for
                     days or weeks. We present long-term learning, a new
                     algorithm that allows the compiler user to improve the
                     performance of compiled programs with reduced
                     compilation times with respect to iterative
                     compilation, and without an initial training phase. Our
                     algorithm does not just build good programs: it
                     acquires knowledge every time a program is compiled and
                     it uses such knowledge to learn compiler heuristics,
                     without the need for an expert to manually define them.
                     The heuristics are evolved during every compilation, by
                     evaluating their effect on the generated programs. We
                     present implementations of long-term learning on top of
                     two different compilers, and experimental data gathered
                     on multiple hardware configurations showing its
                     effectiveness.},
}

@Article{Chrysos:2013:HCP,
  author          = {Grigorios Chrysos and Panagiotis Dagritzikos and
                     Ioannis Papaefstathiou and Apostolos Dollas},
  title           = {{HC-CART}: a parallel system implementation of data
                     mining classification and regression tree {(CART)}
                     algorithm on a multi-{FPGA} system},
  journal         = j-TACO,
  fjournal        = {ACM Transactions on Architecture and Code Optimization
                     (TACO)},
  volume          = {9},
  number          = {4},
  pages           = {47:1--47:??},
  articleno       = {47},
  month           = jan,
  year            = {2013},
  CODEN           = {????},
  DOI             = {http://dx.doi.org/10.1145/2400682.2400706},
  ISSN            = {1544-3566 (print), 1544-3973 (electronic)},
  ISSN-L          = {1544-3566},
  journal-URL     = {http://portal.acm.org/browse_dl.cfm?idx=J924},
  bibdate         = {Fri Jan 18 10:57:16 MST 2013},
  bibsource       = {http://portal.acm.org/;
                     http://www.math.utah.edu/pub/tex/bib/taco.bib},
  acknowledgement = ack-nhfb,
  abstract        = {Data mining is a new field of computer science with a
                     wide range of applications. Its goal is to extract
                     knowledge from massive datasets in a
                     human-understandable structure, for example, the
                     decision trees. In this article we present an
                     innovative, high-performance, system-level architecture
                     for the Classification And Regression Tree (CART)
                     algorithm, one of the most important and widely used
                     algorithms in the data mining area. Our proposed
                     architecture exploits parallelism at the decision
                     variable level, and was fully implemented and evaluated
                     on a modern high-performance reconfigurable platform,
                     the Convey HC-1 server, that features four FPGAs and a
                     multicore processor. Our FPGA-based implementation was
                     integrated with the widely used ``rpart'' software
                     library of the R project in order to provide the first
                     fully functional reconfigurable system that can handle
                     real-world large databases. The proposed system, named
                     HC-CART system, achieves a performance speedup of up to
                     two orders of magnitude compared to well-known
                     single-threaded data mining software platforms, such as
                     WEKA and the R platform. It also outperforms similar
                     hardware systems which implement parts of the complete
                     application by an order of magnitude. Finally, we show
                     that the HC-CART system offers higher performance
                     speedup than some other proposed parallel software
                     implementations of decision tree construction
                     algorithms.},
}

@Article{Lee:2013:DCD,
  author          = {Jongwon Lee and Yohan Ko and Kyoungwoo Lee and Jonghee
                     M. Youn and Yunheung Paek},
  title           = {Dynamic code duplication with vulnerability awareness
                     for soft error detection on {VLIW} architectures},
  journal         = j-TACO,
  fjournal        = {ACM Transactions on Architecture and Code Optimization
                     (TACO)},
  volume          = {9},
  number          = {4},
  pages           = {48:1--48:??},
  articleno       = {48},
  month           = jan,
  year            = {2013},
  CODEN           = {????},
  DOI             = {http://dx.doi.org/10.1145/2400682.2400707},
  ISSN            = {1544-3566 (print), 1544-3973 (electronic)},
  ISSN-L          = {1544-3566},
  journal-URL     = {http://portal.acm.org/browse_dl.cfm?idx=J924},
  bibdate         = {Fri Jan 18 10:57:16 MST 2013},
  bibsource       = {http://portal.acm.org/;
                     http://www.math.utah.edu/pub/tex/bib/taco.bib},
  acknowledgement = ack-nhfb,
  abstract        = {Soft errors are becoming a critical concern in
                     embedded system designs. Code duplication techniques
                     have been proposed to increase the reliability in
                     multi-issue embedded systems such as VLIW by exploiting
                     empty slots for duplicated instructions. However, they
                     increase code size, another important concern, and
                     ignore vulnerability differences in instructions,
                     causing unnecessary or inefficient protection when
                     selecting instructions to be duplicated under
                     constraints. In this article, we propose a
                     compiler-assisted dynamic code duplication method to
                     minimize the code size overhead, and present
                     vulnerability-aware duplication algorithms to maximize
                     the effectiveness of instruction duplication with least
                     overheads for VLIW architecture. Our experimental
                     results with SoarGen and Synopsys simulation
                     environments demonstrate that our proposals can reduce
                     the code size by up to 40\% and detect more soft errors
                     by up to 82\% via fault injection experiments over
                     benchmarks from DSPstone and Livermore Loops as
                     compared to the previously proposed instruction
                     duplication technique.},
}

@Article{Coelho:2013:ACI,
  author          = {Fabien Coelho and Fran{\c{c}}ois Irigoin},
  title           = {{API} compilation for image hardware accelerators},
  journal         = j-TACO,
  fjournal        = {ACM Transactions on Architecture and Code Optimization
                     (TACO)},
  volume          = {9},
  number          = {4},
  pages           = {49:1--49:??},
  articleno       = {49},
  month           = jan,
  year            = {2013},
  CODEN           = {????},
  DOI             = {http://dx.doi.org/10.1145/2400682.2400708},
  ISSN            = {1544-3566 (print), 1544-3973 (electronic)},
  ISSN-L          = {1544-3566},
  journal-URL     = {http://portal.acm.org/browse_dl.cfm?idx=J924},
  bibdate         = {Fri Jan 18 10:57:16 MST 2013},
  bibsource       = {http://portal.acm.org/;
                     http://www.math.utah.edu/pub/tex/bib/taco.bib},
  acknowledgement = ack-nhfb,
  abstract        = {We present an API-based compilation strategy to
                     optimize image applications, developed using a
                     high-level image processing library, onto three
                     different image processing hardware accelerators. We
                     demonstrate that such a strategy is profitable for both
                     development cost and overall performance, especially as
                     it takes advantage of optimization opportunities across
                     library calls otherwise beyond reach. The library API
                     provides the semantics of the image computations. The
                     three image accelerator targets are quite distinct: the
                     first one uses a vector architecture; the second one
                     presents an SIMD architecture; the last one runs both
                     on GPGPU and multicores through OpenCL. We have adapted
                     standard compilation techniques to perform these
                     compilation and code generation tasks automatically.
                     Our strategy is implemented in PIPS, a source-to-source
                     compiler which greatly reduces the development cost as
                     standard phases are reused and parameterized. We
                     carried out experiments with applications on hardware
                     functional simulators and GPUs. Our contributions
                     include: (1) a general low-cost compilation strategy
                     for image processing applications, based on the
                     semantics provided by library calls, which improves
                     locality by an order of magnitude; (2) specific
                     heuristics to minimize execution time on the target
                     accelerators; (3) numerous experiments that show the
                     effectiveness of our strategies. We also discuss the
                     conditions required to extend this approach to other
                     application domains.},
}

@Article{Luque:2013:FCT,
  author          = {Carlos Luque and Miquel Moreto and Francisco J.
                     Cazorla and Mateo Valero},
  title           = {Fair {CPU} time accounting in {CMP+SMT} processors},
  journal         = j-TACO,
  fjournal        = {ACM Transactions on Architecture and Code Optimization
                     (TACO)},
  volume          = {9},
  number          = {4},
  pages           = {50:1--50:??},
  articleno       = {50},
  month           = jan,
  year            = {2013},
  CODEN           = {????},
  DOI             = {http://dx.doi.org/10.1145/2400682.2400709},
  ISSN            = {1544-3566 (print), 1544-3973 (electronic)},
  ISSN-L          = {1544-3566},
  journal-URL     = {http://portal.acm.org/browse_dl.cfm?idx=J924},
  bibdate         = {Fri Jan 18 10:57:16 MST 2013},
  bibsource       = {http://portal.acm.org/;
                     http://www.math.utah.edu/pub/tex/bib/taco.bib},
  acknowledgement = ack-nhfb,
  abstract        = {Processor architectures combining several paradigms of
                     Thread-Level Parallelism (TLP), such as CMP processors
                     in which each core is SMT, are becoming more and more
                     popular as a way to improve performance at a moderate
                     cost. However, the complex interaction between running
                     tasks in hardware shared resources in multi-TLP
                     architectures introduces complexities when accounting
                     CPU time (or CPU utilization) to tasks. The CPU
                     utilization accounted to a task depends on both the
                     time it runs in the processor and the amount of
                     processor hardware resources it receives. Deploying
                     systems with accurate CPU accounting mechanisms is
                     necessary to increase fairness. Moreover, it will allow
                     users to be fairly charged on a shared data center,
                     facilitating server consolidation in future systems. In
                     this article we analyze the accuracy and hardware cost
                     of previous CPU accounting mechanisms for pure-CMP and
                     pure-SMT processors and we show that they are not
                     adequate for CMP+SMT processors. Consequently, we
                     propose a new accounting mechanism for CMP+SMT
                     processors which: (1) increases the accuracy of
                     accounted CPU utilization; (2) provides much more
                     stable results over a wide range of processor setups;
                     and (3) does not require tracking all hardware shared
                     resources, significantly reducing its implementation
                     cost. In particular, previous proposals lead to
                     inaccuracies between 21\% and 79\% when measuring CPU
                     utilization in an 8-core 2-way SMT processor, while our
                     proposal reduces this inaccuracy to less than 5.0\%.},
}

@Article{Mattheakis:2013:SRM,
  author          = {Pavlos M. Mattheakis and Ioannis Papaefstathiou},
  title           = {Significantly reducing {MPI} intercommunication
                     latency and power overhead in both embedded and {HPC}
                     systems},
  journal         = j-TACO,
  fjournal        = {ACM Transactions on Architecture and Code Optimization
                     (TACO)},
  volume          = {9},
  number          = {4},
  pages           = {51:1--51:??},
  articleno       = {51},
  month           = jan,
  year            = {2013},
  CODEN           = {????},
  DOI             = {http://dx.doi.org/10.1145/2400682.2400710},
  ISSN            = {1544-3566 (print), 1544-3973 (electronic)},
  ISSN-L          = {1544-3566},
  journal-URL     = {http://portal.acm.org/browse_dl.cfm?idx=J924},
  bibdate         = {Fri Jan 18 10:57:16 MST 2013},
  bibsource       = {http://portal.acm.org/;
                     http://www.math.utah.edu/pub/tex/bib/taco.bib},
  acknowledgement = ack-nhfb,
  abstract        = {Highly parallel systems are becoming mainstream in a
                     wide range of sectors ranging from their traditional
                     stronghold high-performance computing, to data centers
                     and even embedded systems. However, despite the quantum
                     leaps of improvements in cost and performance of
                     individual components over the last decade (e.g.,
                     processor speeds, memory/interconnection bandwidth,
                     etc.), system manufacturers are still struggling to
                     deliver low-latency, highly scalable solutions. One of
                     the main reasons is that the intercommunication latency
                     grows significantly with the number of processor nodes.
                     This article presents a novel way to reduce this
                     intercommunication delay by implementing, in custom
                     hardware, certain communication tasks. In particular,
                     the proposed novel device implements the two most
                     widely used procedures of the most popular
                     communication protocol in parallel systems the Message
                     Passing Interface (MPI). Our novel approach has
                     initially been simulated within a pioneering parallel
                     systems simulation framework and then synthesized
                     directly from a high-level description language (i.e.,
                     SystemC) using a state-of-the-art synthesis tool. To
                     the best of our knowledge, this is the first article
                     presenting the complete hardware implementation of such
                     a system. The proposed novel approach triggers a
                     speedup from one to four orders of magnitude when
                     compared with conventional software-based solutions and
                     from one to three orders of magnitude when compared
                     with a sophisticated software-based approach. Moreover,
                     the performance of our system is from one to two orders
                     of magnitude higher than the simulated performance of a
                     similar but, relatively simpler hardware architecture;
                     at the same time the power consumption of our device is
                     about two orders of magnitude lower than that of a
                     low-power CPU when executing the exact same
                     intercommunication tasks.},
}

@Article{Baghdadi:2013:ILT,
  author          = {Riyadh Baghdadi and Albert Cohen and Sven Verdoolaege
                     and Konrad Trifunovi{\'c}},
  title           = {Improved loop tiling based on the removal of spurious
                     false dependences},
  journal         = j-TACO,
  fjournal        = {ACM Transactions on Architecture and Code Optimization
                     (TACO)},
  volume          = {9},
  number          = {4},
  pages           = {52:1--52:??},
  articleno       = {52},
  month           = jan,
  year            = {2013},
  CODEN           = {????},
  DOI             = {http://dx.doi.org/10.1145/2400682.2400711},
  ISSN            = {1544-3566 (print), 1544-3973 (electronic)},
  ISSN-L          = {1544-3566},
  journal-URL     = {http://portal.acm.org/browse_dl.cfm?idx=J924},
  bibdate         = {Fri Jan 18 10:57:16 MST 2013},
  bibsource       = {http://portal.acm.org/;
                     http://www.math.utah.edu/pub/tex/bib/taco.bib},
  acknowledgement = ack-nhfb,
  abstract        = {To preserve the validity of loop nest transformations
                     and parallelization, data dependences need to be
                     analyzed. Memory dependences come in two varieties:
                     true dependences or false dependences. While true
                     dependences must be satisfied in order to preserve the
                     correct order of computations, false dependences are
                     induced by the reuse of a single memory location to
                     store multiple values. False dependences reduce the
                     degrees of freedom for loop transformations. In
                     particular, loop tiling is severely limited in the
                     presence of these dependences. While array expansion
                     removes all false dependences, the overhead on memory
                     and the detrimental impact on register-level reuse can
                     be catastrophic. We propose and evaluate a compilation
                     technique to safely ignore a large number of false
                     dependences in order to enable loop nest tiling in the
                     polyhedral model. It is based on the precise
                     characterization of interferences between live range
                     intervals, and it does not incur any scalar or array
                     expansion. Our algorithms have been implemented in the
                     Pluto polyhedral compiler, and evaluated on the
                     PolyBench suite.},
}

@Article{Pop:2013:OED,
  author =       "Antoniu Pop and Albert Cohen",
  title =        "{OpenStream}: Expressiveness and data-flow compilation
                 of {OpenMP} streaming programs",
  journal =      j-TACO,
  volume =       "9",
  number =       "4",
  pages =        "53:1--53:??",
  month =        jan,
  year =         "2013",
  CODEN =        "????",
  DOI =          "http://dx.doi.org/10.1145/2400682.2400712",
  ISSN =         "1544-3566 (print), 1544-3973 (electronic)",
  ISSN-L =       "1544-3566",
  bibdate =      "Fri Jan 18 10:57:16 MST 2013",
  bibsource =    "http://portal.acm.org/;
                 http://www.math.utah.edu/pub/tex/bib/taco.bib",
  abstract =     "We present OpenStream, a data-flow extension of OpenMP
                 to express dynamic dependent tasks. The language
                 supports nested task creation, modular composition,
                 variable and unbounded sets of producers/consumers, and
                 first-class streams. These features, enabled by our
                 original compilation flow, allow translating high-level
                 parallel programming patterns, like dependences arising
                 from StarSs' array regions, or universal low-level
                 primitives like futures. In particular, these dynamic
                 features can be embedded efficiently and naturally into
                 an unmanaged imperative language, avoiding the
                 complexity and overhead of a concurrent garbage
                 collector. We demonstrate the performance advantages of
                 a data-flow execution model compared to more restricted
                 task and barrier models. We also demonstrate the
                 efficiency of our compilation and runtime algorithms
                 for the support of complex dependence patterns arising
                 from StarSs benchmarks.",
  acknowledgement = ack-nhfb,
  articleno =    "53",
  fjournal =     "ACM Transactions on Architecture and Code Optimization
                 (TACO)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J924",
}

@Article{Verdoolaege:2013:PPC,
  author =       "Sven Verdoolaege and Juan Carlos Juega and Albert
                 Cohen and Jos{\'e} Ignacio G{\'o}mez and Christian
                 Tenllado and Francky Catthoor",
  title =        "Polyhedral parallel code generation for {CUDA}",
  journal =      j-TACO,
  volume =       "9",
  number =       "4",
  pages =        "54:1--54:??",
  month =        jan,
  year =         "2013",
  CODEN =        "????",
  DOI =          "http://dx.doi.org/10.1145/2400682.2400713",
  ISSN =         "1544-3566 (print), 1544-3973 (electronic)",
  ISSN-L =       "1544-3566",
  bibdate =      "Fri Jan 18 10:57:16 MST 2013",
  bibsource =    "http://portal.acm.org/;
                 http://www.math.utah.edu/pub/tex/bib/taco.bib",
  abstract =     "This article addresses the compilation of a sequential
                 program for parallel execution on a modern GPU. To this
                 end, we present a novel source-to-source compiler
                 called PPCG. PPCG singles out for its ability to
                 accelerate computations from any static control loop
                 nest, generating multiple CUDA kernels when necessary.
                 We introduce a multilevel tiling strategy and a code
                 generation scheme for the parallelization and locality
                 optimization of imperfectly nested loops, managing
                 memory and exposing concurrency according to the
                 constraints of modern GPUs. We evaluate our algorithms
                 and tool on the entire PolyBench suite.",
  acknowledgement = ack-nhfb,
  articleno =    "54",
  fjournal =     "ACM Transactions on Architecture and Code Optimization
                 (TACO)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J924",
}

@Article{Du:2013:DCC,
  author =       "Yu Du and Miao Zhou and Bruce Childers and Rami Melhem
                 and Daniel Moss{\'e}",
  title =        "Delta-compressed caching for overcoming the write
                 bandwidth limitation of hybrid main memory",
  journal =      j-TACO,
  volume =       "9",
  number =       "4",
  pages =        "55:1--55:??",
  month =        jan,
  year =         "2013",
  CODEN =        "????",
  DOI =          "http://dx.doi.org/10.1145/2400682.2400714",
  ISSN =         "1544-3566 (print), 1544-3973 (electronic)",
  ISSN-L =       "1544-3566",
  bibdate =      "Fri Jan 18 10:57:16 MST 2013",
  bibsource =    "http://portal.acm.org/;
                 http://www.math.utah.edu/pub/tex/bib/taco.bib",
  abstract =     "Limited PCM write bandwidth is a critical obstacle to
                 achieve good performance from hybrid DRAM/PCM memory
                 systems. The write bandwidth is severely restricted in
                 PCM devices, which harms application performance.
                 Indeed, as we show, it is more important to reduce PCM
                 write traffic than to reduce PCM read latency for
                 application performance. To reduce the number of PCM
                 writes, we propose a DRAM cache organization that
                 employs compression. A new delta compression technique
                 for modified data is used to achieve a large
                 compression ratio. Our approach can selectively and
                 predictively apply compression to improve its
                 efficiency and performance. Our approach is designed to
                 facilitate adoption in existing main memory compression
                 frameworks. We describe an instance of how to
                 incorporate delta compression in IBM's MXT memory
                 compression architecture when used for DRAM cache in a
                 hybrid main memory. For fourteen representative
                 memory-intensive workloads, on average, our delta
                 compression technique reduces the number of PCM writes
                 by 54.3\%, and improves IPC performance by 24.4\%.",
  acknowledgement = ack-nhfb,
  articleno =    "55",
  fjournal =     "ACM Transactions on Architecture and Code Optimization
                 (TACO)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J924",
}

@Article{Purini:2013:FGO,
  author =       "Suresh Purini and Lakshya Jain",
  title =        "Finding good optimization sequences covering program
                 space",
  journal =      j-TACO,
  volume =       "9",
  number =       "4",
  pages =        "56:1--56:??",
  month =        jan,
  year =         "2013",
  CODEN =        "????",
  DOI =          "http://dx.doi.org/10.1145/2400682.2400715",
  ISSN =         "1544-3566 (print), 1544-3973 (electronic)",
  ISSN-L =       "1544-3566",
  bibdate =      "Fri Jan 18 10:57:16 MST 2013",
  bibsource =    "http://portal.acm.org/;
                 http://www.math.utah.edu/pub/tex/bib/taco.bib",
  abstract =     "The compiler optimizations we enable and the order in
                 which we apply them on a program have a substantial
                 impact on the program execution time. Compilers provide
                 default optimization sequences which can give good
                 program speedup. As the default sequences have to
                 optimize programs with different characteristics, they
                 embed in them multiple subsequences which can optimize
                 different classes of programs. These multiple
                 subsequences may falsely interact with each other and
                 affect the potential program speedup achievable.
                 Instead of searching for a single universally optimal
                 sequence, we can construct a small set of good
                 sequences such that for every program class there
                 exists a near-optimal optimization sequence in the good
                 sequences set. If we can construct such a good
                 sequences set which covers all the program classes in
                 the program space, then we can choose the best sequence
                 for a program by trying all the sequences in the good
                 sequences set. This approach completely circumvents the
                 need to solve the program classification problem. Using
                 a sequence set size of around 10 we got an average
                 speedup up to 14\% on PolyBench programs and up to 12\%
                 on MiBench programs. Our approach is quite different
                 from either the iterative compilation or
                 machine-learning-based prediction modeling techniques
                 proposed in the literature so far. We use different
                 training and test datasets for cross-validation as
                 against the Leave-One-Out cross-validation technique.",
  acknowledgement = ack-nhfb,
  articleno =    "56",
  fjournal =     "ACM Transactions on Architecture and Code Optimization
                 (TACO)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J924",
}

@Article{Belviranli:2013:DSS,
  author =       "Mehmet E. Belviranli and Laxmi N. Bhuyan and Rajiv
                 Gupta",
  title =        "A dynamic self-scheduling scheme for heterogeneous
                 multiprocessor architectures",
  journal =      j-TACO,
  volume =       "9",
  number =       "4",
  pages =        "57:1--57:??",
  month =        jan,
  year =         "2013",
  CODEN =        "????",
  DOI =          "http://dx.doi.org/10.1145/2400682.2400716",
  ISSN =         "1544-3566 (print), 1544-3973 (electronic)",
  ISSN-L =       "1544-3566",
  bibdate =      "Fri Jan 18 10:57:16 MST 2013",
  bibsource =    "http://portal.acm.org/;
                 http://www.math.utah.edu/pub/tex/bib/taco.bib",
  abstract =     "Today's heterogeneous architectures bring together
                 multiple general-purpose CPUs and multiple
                 domain-specific GPUs and FPGAs to provide dramatic
                 speedup for many applications. However, the challenge
                 lies in utilizing these heterogeneous processors to
                 optimize overall application performance by minimizing
                 workload completion time. Operating system and
                 application development for these systems is in their
                 infancy. In this article, we propose a new scheduling
                 and workload balancing scheme, HDSS, for execution of
                 loops having dependent or independent iterations on
                 heterogeneous multiprocessor systems. The new algorithm
                 dynamically learns the computational power of each
                 processor during an adaptive phase and then schedules
                 the remainder of the workload using a weighted
                 self-scheduling scheme during the completion phase.
                 Different from previous studies, our scheme uniquely
                 considers the runtime effects of block sizes on the
                 performance for heterogeneous multiprocessors. It finds
                 the right trade-off between large and small block sizes
                 to maintain balanced workload while keeping the
                 accelerator utilization at maximum. Our algorithm does
                 not require offline training or architecture-specific
                 parameters. We have evaluated our scheme on two
                 different heterogeneous architectures: AMD 64-core
                 Bulldozer system with nVidia Fermi C2050 GPU and Intel
                 Xeon 32-core SGI Altix 4700 supercomputer with Xilinx
                 Virtex 4 FPGAs. The experimental results show that our
                 new scheduling algorithm can achieve performance
                 improvements up to over 200\% when compared to the
                 closest existing load balancing scheme. Our algorithm
                 also achieves full processor utilization with all
                 processors completing at nearly the same time which is
                 significantly better than alternative current
                 approaches.",
  acknowledgement = ack-nhfb,
  articleno =    "57",
  fjournal =     "ACM Transactions on Architecture and Code Optimization
                 (TACO)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J924",
}

@Article{Negi:2013:SCF,
  author =       "Anurag Negi and Ruben Titos-Gil",
  title =        "{SCIN-cache}: Fast speculative versioning in
                 multithreaded cores",
  journal =      j-TACO,
  volume =       "9",
  number =       "4",
  pages =        "58:1--58:??",
  month =        jan,
  year =         "2013",
  CODEN =        "????",
  DOI =          "http://dx.doi.org/10.1145/2400682.2400717",
  ISSN =         "1544-3566 (print), 1544-3973 (electronic)",
  ISSN-L =       "1544-3566",
  bibdate =      "Fri Jan 18 10:57:16 MST 2013",
  bibsource =    "http://portal.acm.org/;
                 http://www.math.utah.edu/pub/tex/bib/taco.bib",
  abstract =     "This article describes cache designs for efficiently
                 supporting speculative techniques like transactional
                 memory on chip multiprocessors with multithreaded
                 cores. On-demand allocation and prompt freeing of
                 speculative cache space in the design reduces the
                 burden on nonspeculative execution. Quick access to
                 both clean and speculative versions of data for
                 multiple contexts provides flexibility and greater
                 design freedom to HTM architects. Performance analysis
                 shows the designs stand up well against other HTM
                 design proposals, with potential performance gains in
                 high contention applications with small transactions.",
  acknowledgement = ack-nhfb,
  articleno =    "58",
  fjournal =     "ACM Transactions on Architecture and Code Optimization
                 (TACO)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J924",
}

@Article{Lutz:2013:PAF,
  author =       "Thibaut Lutz and Christian Fensch and Murray Cole",
  title =        "{PARTANS}: an autotuning framework for stencil
                 computation on multi-{GPU} systems",
  journal =      j-TACO,
  volume =       "9",
  number =       "4",
  pages =        "59:1--59:??",
  month =        jan,
  year =         "2013",
  CODEN =        "????",
  DOI =          "http://dx.doi.org/10.1145/2400682.2400718",
  ISSN =         "1544-3566 (print), 1544-3973 (electronic)",
  ISSN-L =       "1544-3566",
  bibdate =      "Fri Jan 18 10:57:16 MST 2013",
  bibsource =    "http://portal.acm.org/;
                 http://www.math.utah.edu/pub/tex/bib/taco.bib",
  abstract =     "GPGPUs are a powerful and energy-efficient solution
                 for many problems. For higher performance or larger
                 problems, it is necessary to distribute the problem
                 across multiple GPUs, increasing the already high
                 programming complexity. In this article, we focus on
                 abstracting the complexity of multi-GPU programming for
                 stencil computation. We show that the best strategy
                 depends not only on the stencil operator, problem size,
                 and GPU, but also on the PCI express layout. This adds
                 nonuniform characteristics to a seemingly homogeneous
                 setup, causing up to 23\% performance loss. We address
                 this issue with an autotuner that optimizes the
                 distribution across multiple GPUs.",
  acknowledgement = ack-nhfb,
  articleno =    "59",
  fjournal =     "ACM Transactions on Architecture and Code Optimization
                 (TACO)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J924",
}

@Article{Xiao:2013:SAT,
  author =       "Chunhua Xiao and M.-C. Frank Chang and Jason Cong and
                 Michael Gill and Zhangqin Huang and Chunyue Liu and
                 Glenn Reinman and Hao Wu",
  title =        "Stream arbitration: Towards efficient bandwidth
                 utilization for emerging on-chip interconnects",
  journal =      j-TACO,
  volume =       "9",
  number =       "4",
  pages =        "60:1--60:??",
  month =        jan,
  year =         "2013",
  CODEN =        "????",
  DOI =          "http://dx.doi.org/10.1145/2400682.2400719",
  ISSN =         "1544-3566 (print), 1544-3973 (electronic)",
  ISSN-L =       "1544-3566",
  bibdate =      "Fri Jan 18 10:57:16 MST 2013",
  bibsource =    "http://portal.acm.org/;
                 http://www.math.utah.edu/pub/tex/bib/taco.bib",
  abstract =     "Alternative interconnects are attractive for scaling
                 on-chip communication bandwidth in a power-efficient
                 manner. However, efficient utilization of the bandwidth
                 provided by these emerging interconnects still remains
                 an open problem due to the spatial and temporal
                 communication heterogeneity. In this article, a Stream
                 Arbitration scheme is proposed, where at runtime any
                 source can compete for any communication channel of the
                 interconnect to talk to any destination. We apply
                 stream arbitration to radio frequency interconnect
                 (RF-I). Experimental results show that compared to the
                 representative token arbitration scheme, stream
                 arbitration can provide an average 20\% performance
                 improvement and 12\% power reduction.",
  acknowledgement = ack-nhfb,
  articleno =    "60",
  fjournal =     "ACM Transactions on Architecture and Code Optimization
                 (TACO)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J924",
}

@Article{Chen:2013:DRU,
  author =       "Yunji Chen and Tianshi Chen and Ling Li and Ruiyang Wu
                 and Daofu Liu and Weiwu Hu",
  title =        "Deterministic Replay Using Global Clock",
  journal =      j-TACO,
  volume =       "10",
  number =       "1",
  pages =        "1:1--1:??",
  month =        apr,
  year =         "2013",
  CODEN =        "????",
  DOI =          "http://dx.doi.org/10.1145/2445572.2445573",
  ISSN =         "1544-3566 (print), 1544-3973 (electronic)",
  ISSN-L =       "1544-3566",
  bibdate =      "Fri Apr 5 18:36:16 MDT 2013",
  bibsource =    "http://portal.acm.org/;
                 http://www.math.utah.edu/pub/tex/bib/taco.bib",
  abstract =     "Debugging parallel programs is a well-known difficult
                 problem. A promising method to facilitate debugging
                 parallel programs is using hardware support to achieve
                 deterministic replay on a Chip Multi-Processor (CMP).
                 As a Design-For-Debug (DFD) feature, a practical
                 hardware-assisted deterministic replay scheme should
                 have low design and verification costs, as well as a
                 small log size. To achieve these goals, we propose a
                 novel and succinct hardware-assisted deterministic
                 replay scheme named LReplay. The key innovation of
                 LReplay is that instead of recording the logical time
                 orders between instructions or instruction blocks as
                 previous investigations, LReplay is built upon
                 recording the pending period information infused by the
                 global clock. By the recorded pending period
                 information, about 99\% execution orders are
                 inferrable, implying that LReplay only needs to record
                 directly the residual 1\% noninferrable execution
                 orders in production run. The 1\% noninferrable orders
                 can be addressed by a simple yet cost-effective
                 direction prediction technique, which further reduces
                 the log size of LReplay. Benefiting from the preceding
                 innovations, the overall log size of LReplay over
                 SPLASH-2 benchmarks is about 0.17B/K-Inst (byte per
                 k-instruction) for the sequential consistency, and
                 0.57B/K-Inst for the Godson-3 consistency. Such log
                 sizes are smaller in an order of magnitude than
                 previous deterministic replay schemes incurring no
                 performance loss. Furthermore, LReplay only consumes
                 about 0.5\% area of the Godson-3 CMP, since it requires
                 only trivial modifications to existing components of
                 Godson-3. The features of LReplay demonstrate the
                 potential of integrating hardware support for
                 deterministic replay into future industrial
                 processors.",
  acknowledgement = ack-nhfb,
  articleno =    "1",
  fjournal =     "ACM Transactions on Architecture and Code Optimization
                 (TACO)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J924",
}

@Article{Lustig:2013:TIC,
  author =       "Daniel Lustig and Abhishek Bhattacharjee and Margaret
                 Martonosi",
  title =        "{TLB} Improvements for Chip Multiprocessors:
                 Inter-Core Cooperative Prefetchers and Shared
                 Last-Level {TLBs}",
  journal =      j-TACO,
  volume =       "10",
  number =       "1",
  pages =        "2:1--2:??",
  month =        apr,
  year =         "2013",
  CODEN =        "????",
  DOI =          "http://dx.doi.org/10.1145/2445572.2445574",
  ISSN =         "1544-3566 (print), 1544-3973 (electronic)",
  ISSN-L =       "1544-3566",
  bibdate =      "Fri Apr 5 18:36:16 MDT 2013",
  bibsource =    "http://portal.acm.org/;
                 http://www.math.utah.edu/pub/tex/bib/taco.bib",
  abstract =     "Translation Lookaside Buffers (TLBs) are critical to
                 overall system performance. Much past research has
                 addressed uniprocessor TLBs, lowering access times and
                 miss rates. However, as Chip MultiProcessors (CMPs)
                 become ubiquitous, TLB design and performance must be
                 reevaluated. Our article begins by performing a
                 thorough TLB performance evaluation of sequential and
                 parallel benchmarks running on a real-world, modern CMP
                 system using hardware performance counters. This
                 analysis demonstrates the need for further improvement
                 of TLB hit rates for both classes of application, and
                 it also points out that the data TLB has a
                 significantly higher miss rate than the instruction TLB
                 in both cases. In response to the characterization
                 data, we propose and evaluate both Inter-Core
                 Cooperative (ICC) TLB prefetchers and Shared Last-Level
                 (SLL) TLBs as alternatives to the commercial norm of
                 private, per-core L2 TLBs. ICC prefetchers eliminate
                 19\% to 90\% of Data TLB (D-TLB) misses across parallel
                 workloads while requiring only modest changes in
                 hardware. SLL TLBs eliminate 7\% to 79\% of D-TLB
                 misses for parallel workloads and 35\% to 95\% of D-TLB
                 misses for multiprogrammed sequential workloads. This
                 corresponds to 27\% and 21\% increases in hit rates as
                 compared to private, per-core L2 TLBs, respectively,
                 and is achieved this using even more modest hardware
                 requirements. Because of their benefits for parallel
                 applications, their applicability to sequential
                 workloads, and their readily implementable hardware,
                 SLL TLBs and ICC TLB prefetchers hold great promise for
                 CMPs.",
  acknowledgement = ack-nhfb,
  articleno =    "2",
  fjournal =     "ACM Transactions on Architecture and Code Optimization
                 (TACO)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J924",
}

@Article{Chen:2013:TME,
  author =       "Rong Chen and Haibo Chen",
  title =        "{Tiled-MapReduce}: Efficient and Flexible {MapReduce}
                 Processing on Multicore with Tiling",
  journal =      j-TACO,
  volume =       "10",
  number =       "1",
  pages =        "3:1--3:??",
  month =        apr,
  year =         "2013",
  CODEN =        "????",
  DOI =          "http://dx.doi.org/10.1145/2445572.2445575",
  ISSN =         "1544-3566 (print), 1544-3973 (electronic)",
  ISSN-L =       "1544-3566",
  bibdate =      "Fri Apr 5 18:36:16 MDT 2013",
  bibsource =    "http://portal.acm.org/;
                 http://www.math.utah.edu/pub/tex/bib/taco.bib",
  abstract =     "The prevalence of chip multiprocessors opens
                 opportunities of running data-parallel applications
                 originally in clusters on a single machine with many
                 cores. MapReduce, a simple and elegant programming
                 model to program large-scale clusters, has recently
                 been shown a promising alternative to harness the
                 multicore platform. The differences such as memory
                 hierarchy and communication patterns between clusters
                 and multicore platforms raise new challenges to design
                 and implement an efficient MapReduce system on
                 multicore. This article argues that it is more
                 efficient for MapReduce to iteratively process small
                 chunks of data in turn than processing a large chunk of
                 data at a time on shared memory multicore platforms.
                 Based on the argument, we extend the general MapReduce
                 programming model with a ``tiling strategy'', called
                 Tiled-MapReduce (TMR). TMR partitions a large
                 MapReduce job into a number of small subjobs and
                 iteratively processes one subjob at a time with
                 efficient use of resources; TMR finally merges the
                 results of all subjobs for output. Based on
                 Tiled-MapReduce, we design and implement several
                 optimizing techniques targeting multicore, including
                 the reuse of the input buffer among subjobs, a
                 NUCA/NUMA-aware scheduler, and pipelining a subjob's
                 reduce phase with the successive subjob's map phase, to
                 optimize the memory, cache, and CPU resources
                 accordingly. Further, we demonstrate that
                 Tiled-MapReduce supports fine-grained fault tolerance
                 and enables several usage scenarios such as online and
                 incremental computing on multicore machines.
                 Performance evaluation with our prototype system called
                 Ostrich on a 48-core machine shows that Ostrich saves
                 up to 87.6\% memory, causes less cache misses, and
                 makes more efficient use of CPU cores, resulting in a
                 speedup ranging from 1.86x to 3.07x over Phoenix.
                 Ostrich also efficiently supports fine-grained fault
                 tolerance, online, and incremental computing with small
                 performance penalty.",
  acknowledgement = ack-nhfb,
  articleno =    "3",
  fjournal =     "ACM Transactions on Architecture and Code Optimization
                 (TACO)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J924",
}

@Article{Becchi:2013:DTS,
  author =       "Michela Becchi and Patrick Crowley",
  title =        "{A-DFA}: a Time- and Space-Efficient {DFA} Compression
                 Algorithm for Fast Regular Expression Evaluation",
  journal =      j-TACO,
  volume =       "10",
  number =       "1",
  pages =        "4:1--4:26",
  month =        apr,
  year =         "2013",
  CODEN =        "????",
  DOI =          "http://dx.doi.org/10.1145/2445572.2445576",
  ISSN =         "1544-3566 (print), 1544-3973 (electronic)",
  ISSN-L =       "1544-3566",
  bibdate =      "Fri Apr 5 18:36:16 MDT 2013",
  bibsource =    "http://portal.acm.org/;
                 http://www.math.utah.edu/pub/tex/bib/taco.bib",
  abstract =     "Modern network intrusion detection systems need to
                 perform regular expression matching at line rate in
                 order to detect the occurrence of critical patterns in
                 packet payloads. While Deterministic Finite Automata
                 (DFAs) allow this operation to be performed in linear
                 time, they may exhibit prohibitive memory requirements.
                 Kumar et al. [2006a] have proposed Delayed Input DFAs
                 (D$^2$FAs), which provide a trade-off between the memory
                 requirements of the compressed DFA and the number of
                 states visited for each character processed, which in
                 turn affects the memory bandwidth required to evaluate
                 regular expressions. In this article we introduce
                 Amortized time-bandwidth overhead DFAs (A-DFAs), a
                 general compression technique that results in at most
                 $N (k + 1) / k$ state traversals when processing a
                 string of length $N$, $k$ being a positive integer. In
                 comparison to the D$^2$FA approach, our technique achieves
                 comparable levels of compression with lower provable
                 bounds on memory bandwidth (or greater compression for
                 a given bandwidth bound). Moreover, the A-DFA algorithm
                 has lower complexity, can be applied during DFA
                 creation, and is suitable for scenarios where a
                 compressed DFA needs to be dynamically built or
                 updated. Finally, we show how to combine A-DFA with
                 alphabet reduction and multistride DFAs, two techniques
                 aimed at reducing the memory space and bandwidth
                 requirement of DFAs, and discuss memory encoding
                 schemes suitable for A-DFAs.",
  acknowledgement = ack-nhfb,
  articleno =    "4",
  fjournal =     "ACM Transactions on Architecture and Code Optimization
                 (TACO)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J924",
}

%%% TACO 10(1), article 5.  The abstract's squared-area metric is written
%%% with TeX math superscripts (EDA$^2$P) so that it is not read as "EDA2P".
@Article{Li:2013:MFM,
  author =       "Sheng Li and Jung Ho Ahn and Richard D. Strong and Jay
                 B. Brockman and Dean M. Tullsen and Norman P. Jouppi",
  title =        "The {McPAT} Framework for Multicore and Manycore
                 Architectures: Simultaneously Modeling Power, Area, and
                 Timing",
  journal =      j-TACO,
  volume =       "10",
  number =       "1",
  pages =        "5:1--5:??",
  month =        apr,
  year =         "2013",
  CODEN =        "????",
  DOI =          "http://dx.doi.org/10.1145/2445572.2445577",
  ISSN =         "1544-3566 (print), 1544-3973 (electronic)",
  ISSN-L =       "1544-3566",
  bibdate =      "Fri Apr 5 18:36:16 MDT 2013",
  bibsource =    "http://portal.acm.org/;
                 http://www.math.utah.edu/pub/tex/bib/taco.bib",
  abstract =     "This article introduces McPAT, an integrated power,
                 area, and timing modeling framework that supports
                 comprehensive design space exploration for multicore
                 and manycore processor configurations ranging from 90nm
                 to 22nm and beyond. At microarchitectural level, McPAT
                 includes models for the fundamental components of a
                 complete chip multiprocessor, including in-order and
                 out-of-order processor cores, networks-on-chip, shared
                 caches, and integrated system components such as memory
                 controllers and Ethernet controllers. At circuit level,
                 McPAT supports detailed modeling of critical-path
                 timing, area, and power. At technology level, McPAT
                 models timing, area, and power for the device types
                 forecast in the ITRS roadmap. McPAT has a flexible XML
                 interface to facilitate its use with many performance
                 simulators. Combined with a performance simulator,
                 McPAT enables architects to accurately quantify the
                 cost of new ideas and assess trade-offs of different
                 architectures using new metrics such as
                 Energy-Delay-Area$^2$ Product (EDA$^2$P) and
                 Energy-Delay-Area Product (EDAP). This article explores
                 the interconnect options of future manycore processors
                 by varying the degree of clustering over generations of
                 process technologies. Clustering will bring interesting
                 trade-offs between area and performance because the
                 interconnects needed to group cores into clusters incur
                 area overhead, but many applications can make good use
                 of them due to synergies from cache sharing. Combining
                 power, area, and timing results of McPAT with
                 performance simulation of PARSEC benchmarks for
                 manycore designs at the 22nm technology shows that
                 8-core clustering gives the best energy-delay product,
                 whereas when die area is taken into account, 4-core
                 clustering gives the best EDA$^2$P and EDAP.",
  acknowledgement = ack-nhfb,
  articleno =    "5",
  fjournal =     "ACM Transactions on Architecture and Code Optimization
                 (TACO)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J924",
}

%%% TACO 10(2), article 6.  Field values are brace-delimited; the macros
%%% j-TACO and ack-nhfb are defined elsewhere in this file.
@Article{Kritikakou:2013:NOM,
  author =       {Angeliki Kritikakou and Francky Catthoor and
                  George S. Athanasiou and Vasilios Kelefouras and
                  Costas Goutis},
  title =        {Near-Optimal Microprocessor and Accelerators
                  Codesign with Latency and Throughput Constraints},
  journal =      j-TACO,
  volume =       {10},
  number =       {2},
  pages =        {6:1--6:??},
  month =        may,
  year =         {2013},
  CODEN =        {????},
  DOI =          {http://dx.doi.org/10.1145/2459316.2459317},
  ISSN =         {1544-3566 (print), 1544-3973 (electronic)},
  ISSN-L =       {1544-3566},
  bibdate =      {Wed May 1 16:38:16 MDT 2013},
  bibsource =    {http://portal.acm.org/;
                  http://www.math.utah.edu/pub/tex/bib/taco.bib},
  abstract =     {A systematic methodology for near-optimal
                  software/hardware codesign mapping onto an FPGA
                  platform with microprocessor and HW accelerators
                  is proposed. The mapping steps deal with the
                  inter-organization, the foreground memory
                  management, and the datapath mapping. A step is
                  described by parameters and equations combined in
                  a scalable template. Mapping decisions are
                  propagated as design constraints to prune
                  suboptimal options in next steps. Several
                  performance-area Pareto points are produced by
                  instantiating the parameters. To evaluate our
                  methodology we map a real-time bio-imaging
                  application and loop-dominated benchmarks.},
  acknowledgement = ack-nhfb,
  articleno =    {6},
  fjournal =     {ACM Transactions on Architecture and Code
                  Optimization (TACO)},
  journal-URL =  {http://portal.acm.org/browse_dl.cfm?idx=J924},
}

%%% TACO 10(2), article 7.  Field values are brace-delimited; the macros
%%% j-TACO and ack-nhfb are defined elsewhere in this file.
@Article{Jiang:2013:HAC,
  author =       {Lei Jiang and Yu Du and Bo Zhao and Youtao Zhang
                  and Bruce R. Childers and Jun Yang},
  title =        {Hardware-Assisted Cooperative Integration of
                  Wear-Leveling and Salvaging for Phase Change
                  Memory},
  journal =      j-TACO,
  volume =       {10},
  number =       {2},
  pages =        {7:1--7:??},
  month =        may,
  year =         {2013},
  CODEN =        {????},
  DOI =          {http://dx.doi.org/10.1145/2459316.2459318},
  ISSN =         {1544-3566 (print), 1544-3973 (electronic)},
  ISSN-L =       {1544-3566},
  bibdate =      {Wed May 1 16:38:16 MDT 2013},
  bibsource =    {http://portal.acm.org/;
                  http://www.math.utah.edu/pub/tex/bib/taco.bib},
  abstract =     {Phase Change Memory (PCM) has recently emerged as
                  a promising memory technology. However, PCM's
                  limited write endurance restricts its immediate
                  use as a replacement for DRAM. To extend the
                  lifetime of PCM chips, wear-leveling and salvaging
                  techniques have been proposed. Wear-leveling
                  balances write operations across different PCM
                  regions while salvaging extends the duty cycle and
                  provides graceful degradation for a nonnegligible
                  number of failures. Current wear-leveling and
                  salvaging schemes have not been designed and
                  integrated to work cooperatively to achieve the
                  best PCM device lifetime. In particular, a
                  noncontiguous PCM space generated from salvaging
                  complicates wear-leveling and incurs large
                  overhead. In this article, we propose LLS, a
                  Line-Level mapping and Salvaging design. By
                  allocating a dynamic portion of total space in a
                  PCM device as backup space, and mapping failed
                  lines to backup PCM, LLS constructs a contiguous
                  PCM space and masks lower-level failures from the
                  OS and applications. LLS integrates wear-leveling
                  and salvaging and copes well with modern OSes. Our
                  experimental results show that LLS achieves 31\%
                  longer lifetime than the state-of-the-art. It has
                  negligible hardware cost and performance
                  overhead.},
  acknowledgement = ack-nhfb,
  articleno =    {7},
  fjournal =     {ACM Transactions on Architecture and Code
                  Optimization (TACO)},
  journal-URL =  {http://portal.acm.org/browse_dl.cfm?idx=J924},
}

%%% TACO 10(2), article 8.
%%% NOTE(review): the abstract's "instruction-fetching\slash decoding\slash
%%% nullifying" restores separators that appear to have been lost during
%%% extraction; confirm against the publisher's abstract.
@Article{Han:2013:PEP,
  author =       "Kyuseung Han and Junwhan Ahn and Kiyoung Choi",
  title =        "Power-Efficient Predication Techniques for
                 Acceleration of Control Flow Execution on {CGRA}",
  journal =      j-TACO,
  volume =       "10",
  number =       "2",
  pages =        "8:1--8:??",
  month =        may,
  year =         "2013",
  CODEN =        "????",
  DOI =          "http://dx.doi.org/10.1145/2459316.2459319",
  ISSN =         "1544-3566 (print), 1544-3973 (electronic)",
  ISSN-L =       "1544-3566",
  bibdate =      "Wed May 1 16:38:16 MDT 2013",
  bibsource =    "http://portal.acm.org/;
                 http://www.math.utah.edu/pub/tex/bib/taco.bib",
  abstract =     "Coarse-grained reconfigurable architecture typically
                 has an array of processing elements which are
                 controlled by a centralized unit. This makes it
                 difficult to execute programs having control divergence
                 among PEs without predication. However, conventional
                 predication techniques have a negative impact on both
                 performance and power consumption due to longer
                 instruction words and unnecessary
                 instruction-fetching\slash decoding\slash nullifying
                 steps. This article reveals
                 performance and power issues in predicated execution
                 which have not been well-addressed yet. Furthermore, it
                 proposes fast and power-efficient predication
                 mechanisms. Experiments conducted through gate-level
                 simulation show that our mechanism improves
                 energy-delay product by 11.9\% to 23.8\% on average.",
  acknowledgement = ack-nhfb,
  articleno =    "8",
  fjournal =     "ACM Transactions on Architecture and Code Optimization
                 (TACO)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J924",
}

%%% TACO 10(2), article 9.  Field values are brace-delimited; the inner
%%% braces around MP-Tomasulo protect its capitalization from recasing.
@Article{Wang:2013:MTD,
  author =       {Chao Wang and Xi Li and Junneng Zhang and
                  Xuehai Zhou and Xiaoning Nie},
  title =        {{MP-Tomasulo}: a Dependency-Aware Automatic
                  Parallel Execution Engine for Sequential Programs},
  journal =      j-TACO,
  volume =       {10},
  number =       {2},
  pages =        {9:1--9:??},
  month =        may,
  year =         {2013},
  CODEN =        {????},
  DOI =          {http://dx.doi.org/10.1145/2459316.2459320},
  ISSN =         {1544-3566 (print), 1544-3973 (electronic)},
  ISSN-L =       {1544-3566},
  bibdate =      {Wed May 1 16:38:16 MDT 2013},
  bibsource =    {http://portal.acm.org/;
                  http://www.math.utah.edu/pub/tex/bib/taco.bib},
  abstract =     {This article presents MP-Tomasulo, a
                  dependency-aware automatic parallel task execution
                  engine for sequential programs. Applying the
                  instruction-level Tomasulo algorithm to MPSoC
                  environments, MP-Tomasulo detects and eliminates
                  Write-After-Write (WAW) and Write-After-Read (WAR)
                  inter-task dependencies in the dataflow execution,
                  therefore to operate out-of-order task execution
                  on heterogeneous units. We implemented the
                  prototype system within a single FPGA.
                  Experimental results on EEMBC applications
                  demonstrate that MP-Tomasulo can execute the tasks
                  out-of-order to achieve as high as 93.6\% to
                  97.6\% of ideal peak speedup. A comparative study
                  against a state-of-the-art dataflow execution
                  scheme is illustrated with a classic JPEG
                  application. The promising results show
                  MP-Tomasulo enables programmers to uncover more
                  task-level parallelism on heterogeneous systems,
                  as well as to ease the burden of programmers.},
  acknowledgement = ack-nhfb,
  articleno =    {9},
  fjournal =     {ACM Transactions on Architecture and Code
                  Optimization (TACO)},
  journal-URL =  {http://portal.acm.org/browse_dl.cfm?idx=J924},
}

%%% TACO 10(3): reviewer acknowledgement list (no abstract).
%%% NOTE(review): article number 9 is also carried by Wang:2013:MTD in
%%% issue 10(2); presumably this mirrors the publisher's numbering ---
%%% confirm before relying on articleno/pages to identify this item.
@Article{Anonymous:2013:TR,
  author =       {Anonymous},
  title =        {{TACO} Reviewers 2012},
  journal =      j-TACO,
  volume =       {10},
  number =       {3},
  pages =        {9:1--9:??},
  month =        sep,
  year =         {2013},
  CODEN =        {????},
  DOI =          {http://dx.doi.org/10.1145/2509420.2509421},
  ISSN =         {1544-3566 (print), 1544-3973 (electronic)},
  ISSN-L =       {1544-3566},
  bibdate =      {Mon Sep 16 17:20:12 MDT 2013},
  bibsource =    {http://portal.acm.org/;
                  http://www.math.utah.edu/pub/tex/bib/taco.bib},
  acknowledgement = ack-nhfb,
  articleno =    {9},
  fjournal =     {ACM Transactions on Architecture and Code
                  Optimization (TACO)},
  journal-URL =  {http://portal.acm.org/browse_dl.cfm?idx=J924},
}

%%% TACO 10(3), article 10.  Title is stored in Title Case, as in the
%%% volume 10(1)--10(2) entries of this file; styles may downcase it.
@Article{Shifer:2013:LLA,
  author =       "Eran Shifer and Shlomo Weiss",
  title =        "Low-Latency Adaptive Mode Transitions and Hierarchical
                 Power Management in Asymmetric Clustered Cores",
  journal =      j-TACO,
  volume =       "10",
  number =       "3",
  pages =        "10:1--10:??",
  month =        sep,
  year =         "2013",
  CODEN =        "????",
  DOI =          "http://dx.doi.org/10.1145/2499901",
  ISSN =         "1544-3566 (print), 1544-3973 (electronic)",
  ISSN-L =       "1544-3566",
  bibdate =      "Mon Sep 16 17:20:12 MDT 2013",
  bibsource =    "http://portal.acm.org/;
                 http://www.math.utah.edu/pub/tex/bib/taco.bib",
  abstract =     "Recently, engineering solutions that include
                 asymmetric multicores have been fabricated for low
                 form-factor computing devices, indicating a potential
                 direction for future evolution of processors. In this
                 article we propose an asymmetric clustered core
                 architecture, exhibiting low-latency switching between
                 modes relative to asymmetric multicores, and having
                 similarities with the same asymmetric multicore
                 architecture in the context of a wider dynamic range of
                 the processor power-performance characteristic.
                 Asymmetric clustered cores incur additional
                 microarchitectural complexity and area cost inside a
                 core but exhibit better chip-level integration
                 characteristics compared to asymmetric multicores.
                 Focusing on power efficiency of asymmetric clustered
                 cores, we describe: (1) a hierarchical power management
                 partitioning between the operating system and on-die
                 firmware for coarse-grain switch policies, and (2)
                 core-internal tracking hardware for fine-grain
                 switching. The mode switch policies of the core's
                 tracking hardware are dependent on higher-level
                 directives and hints from the operating system, on-die
                 firmware, and compiler or profiling software. We
                 further explore the potential power management benefits
                 of asymmetric clustered cores relative to asymmetric
                 multicores, demonstrating that the ability of
                 asymmetric clustered cores to use tight training
                 periods for adaptive behavior, with low overhead
                 switching between modes, results in a more efficient
                 utilization of power management directives.",
  acknowledgement = ack-nhfb,
  articleno =    "10",
  fjournal =     "ACM Transactions on Architecture and Code Optimization
                 (TACO)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J924",
}

%%% TACO 10(3), article 11.  The first author's family name is the
%%% two-word "Ben Asher"; it is braced so that BibTeX does not treat "Ben"
%%% as a given name.  The citation key is kept unchanged for existing
%%% citations.  NOTE(review): confirm the surname against the paper.
@Article{Asher:2013:HTL,
  author =       "Yosi {Ben Asher} and Nadav Rotem",
  title =        "Hybrid Type Legalization for a Sparse {SIMD}
                 Instruction Set",
  journal =      j-TACO,
  volume =       "10",
  number =       "3",
  pages =        "11:1--11:??",
  month =        sep,
  year =         "2013",
  CODEN =        "????",
  DOI =          "http://dx.doi.org/10.1145/2509420.2509422",
  ISSN =         "1544-3566 (print), 1544-3973 (electronic)",
  ISSN-L =       "1544-3566",
  bibdate =      "Mon Sep 16 17:20:12 MDT 2013",
  bibsource =    "http://portal.acm.org/;
                 http://www.math.utah.edu/pub/tex/bib/taco.bib",
  abstract =     "SIMD vector units implement only a subset of the
                 operations used by vectorizing compilers, and there are
                 multiple conflicting techniques to legalize arbitrary
                 vector types into register-sized data types.
                 Traditionally, type legalization is performed using a
                 set of predefined rules, regardless of the operations
                 used in the program. This method is not suitable to
                 sparse SIMD instruction sets and often prevents the
                 vectorization of programs. In this work we introduce a
                 new technique for type legalization, namely vector
                 element promotion, as well as a hybrid method for
                 combining multiple techniques of type legalization. Our
                 hybrid type legalization method makes decisions based
                 on the knowledge of the available instruction set as
                 well as the operations used in the program. Our
                 experimental results demonstrate that program-dependent
                 hybrid type legalization improves the execution time of
                 vector programs, outperforms the existing legalization
                 method, and allows the vectorization of workloads which
                 were not vectorized before.",
  acknowledgement = ack-nhfb,
  articleno =    "11",
  fjournal =     "ACM Transactions on Architecture and Code Optimization
                 (TACO)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J924",
}

%%% TACO 10(3), article 12; also listed in fparith.bib (see bibsource).
%%% Title is stored in Title Case, as in the volume 10(1)--10(2) entries
%%% of this file; styles may downcase it.
@Article{Lei:2013:VCI,
  author =       "Yuanwu Lei and Yong Dou and Lei Guo and Jinbo Xu and
                 Jie Zhou and Yazhuo Dong and Hongjian Li",
  title =        "{VLIW} Coprocessor for {IEEE-754} Quadruple-Precision
                 Elementary Functions",
  journal =      j-TACO,
  volume =       "10",
  number =       "3",
  pages =        "12:1--12:??",
  month =        sep,
  year =         "2013",
  CODEN =        "????",
  DOI =          "http://dx.doi.org/10.1145/2512430",
  ISSN =         "1544-3566 (print), 1544-3973 (electronic)",
  ISSN-L =       "1544-3566",
  bibdate =      "Mon Sep 16 17:20:12 MDT 2013",
  bibsource =    "http://portal.acm.org/;
                 http://www.math.utah.edu/pub/tex/bib/fparith.bib;
                 http://www.math.utah.edu/pub/tex/bib/taco.bib",
  abstract =     "In this article, a unified VLIW coprocessor, based on
                 a common group of atomic operation units, for Quad
                 arithmetic and elementary functions (QP\_VELP) is
                 presented. The explicitly parallel scheme of VLIW
                 instruction and Estrin's evaluation scheme for
                 polynomials are used to improve the performance. A
                 two-level VLIW instruction RAM scheme is introduced to
                 achieve high scalability and customizability, even for
                 more complex key program kernels. Finally, the Quad
                 arithmetic accelerator (QAA) with the QP\_VELP array is
                 implemented on ASIC. Compared with hyper-thread
                 software implementation on an Intel Xeon E5620, QAA
                 with 8 QP\_VELP units achieves improvement by a factor
                 of 18X.",
  acknowledgement = ack-nhfb,
  articleno =    "12",
  fjournal =     "ACM Transactions on Architecture and Code Optimization
                 (TACO)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J924",
}

%%% TACO 10(3), article 13; also listed in virtual-machines.bib (see
%%% bibsource).  The benchmark name is spelled SPECjbb2000 throughout the
%%% abstract.  Title is stored in Title Case; styles may downcase it.
@Article{Kawahito:2013:IRF,
  author =       "Motohiro Kawahito and Hideaki Komatsu and Takao
                 Moriyama and Hiroshi Inoue and Toshio Nakatani",
  title =        "Idiom Recognition Framework Using Topological
                 Embedding",
  journal =      j-TACO,
  volume =       "10",
  number =       "3",
  pages =        "13:1--13:??",
  month =        sep,
  year =         "2013",
  CODEN =        "????",
  DOI =          "http://dx.doi.org/10.1145/2512431",
  ISSN =         "1544-3566 (print), 1544-3973 (electronic)",
  ISSN-L =       "1544-3566",
  bibdate =      "Mon Sep 16 17:20:12 MDT 2013",
  bibsource =    "http://portal.acm.org/;
                 http://www.math.utah.edu/pub/tex/bib/taco.bib;
                 http://www.math.utah.edu/pub/tex/bib/virtual-machines.bib",
  abstract =     "Modern processors support hardware-assist instructions
                 (such as TRT and TROT instructions on the IBM System z)
                 to accelerate certain functions such as delimiter
                 search and character conversion. Such special
                 instructions are often used in high-performance
                 libraries, but their exploitation in optimizing
                 compilers has been limited. We devised a new idiom
                 recognition technique based on a topological embedding
                 algorithm to detect idiom patterns in the input
                 programs more aggressively than in previous approaches
                 using exact pattern matching. Our approach can detect a
                 pattern even if the code segment does not exactly match
                 the idiom. For example, we can detect a code segment
                 that includes additional code within the idiom pattern.
                 We also propose an instruction simplification for the
                 idiom recognition. This optimization analyzes all of
                 the usages of the output of the optimized code for a
                 specific idiom. If we find that we do not need an
                 actual value for the output but only a value in a
                 subrange, then we can assign a value in that subrange
                 as the output. The code generation can generate faster
                 code with this optimization. We implemented our new
                 idiom recognition approach based on the Java
                 Just-In-Time (JIT) compiler that is part of the J9 Java
                 Virtual Machine, and we supported several important
                 idioms for the special hardware-assist instructions on
                 the IBM System z and on some models of the IBM System
                 p. To demonstrate the effectiveness of our technique,
                 we performed two experiments. The first experiment was
                 to see how many more patterns we can detect compared to
                 the previous approach. The second experiment measured
                 the performance improvements over the previous
                 approaches. For the first experiment, we used the Java
                 Compatibility Kit (JCK) API tests. For the second
                 experiment we used the IBM XML parser, SPECjvm98, and
                 SPECjbb2000. In summary, relative to a baseline
                 implementation using exact pattern matching, our
                 algorithm converted 76\% more loops in JCK tests. On a
                 z9, we also observed significant average performance
                 improvement of the XML parser by 54\%, of SPECjvm98 by
                 1.9\%, and of SPECjbb2000 by 4.4\%. Finally, we
                 observed that the JIT compilation time increased by
                 only 0.32\% to 0.44\%.",
  acknowledgement = ack-nhfb,
  articleno =    "13",
  fjournal =     "ACM Transactions on Architecture and Code Optimization
                 (TACO)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J924",
}

%%% TACO 10(3), article 14.  The third author's family name is taken to be
%%% the two-word "Abu Rmaileh"; it is braced so that BibTeX does not treat
%%% "Abu" as a given name.  NOTE(review): confirm against the paper.
%%% Title is stored in Title Case; styles may downcase it.
@Article{Shobaki:2013:PIS,
  author =       "Ghassan Shobaki and Maxim Shawabkeh and Najm Eldeen
                 {Abu Rmaileh}",
  title =        "Preallocation Instruction Scheduling with Register
                 Pressure Minimization Using a Combinatorial
                 Optimization Approach",
  journal =      j-TACO,
  volume =       "10",
  number =       "3",
  pages =        "14:1--14:??",
  month =        sep,
  year =         "2013",
  CODEN =        "????",
  DOI =          "http://dx.doi.org/10.1145/2512432",
  ISSN =         "1544-3566 (print), 1544-3973 (electronic)",
  ISSN-L =       "1544-3566",
  bibdate =      "Mon Sep 16 17:20:12 MDT 2013",
  bibsource =    "http://portal.acm.org/;
                 http://www.math.utah.edu/pub/tex/bib/taco.bib",
  abstract =     "Balancing Instruction-Level Parallelism (ILP) and
                 register pressure during preallocation instruction
                 scheduling is a fundamentally important problem in code
                 generation and optimization. The problem is known to be
                 NP-complete. Many heuristic techniques have been
                 proposed to solve this problem. However, due to the
                 inherently conflicting requirements of maximizing ILP
                 and minimizing register pressure, heuristic techniques
                 may produce poor schedules in many cases. If such cases
                 occur in hot code, significant performance degradation
                 may result. A few combinatorial optimization approaches
                 have also been proposed, but none of them has been
                 shown to solve large real-world instances within
                 reasonable time. This article presents the first
                 combinatorial algorithm that is efficient enough to
                 optimally solve large instances of this problem (basic
                 blocks with hundreds of instructions) within a few
                 seconds per instance. The proposed algorithm uses
                 branch-and-bound enumeration with a number of powerful
                 pruning techniques to efficiently search the solution
                 space. The search is based on a cost function that
                 incorporates schedule length and register pressure. An
                 implementation of the proposed scheduling algorithm has
                 been integrated into the LLVM Compiler and evaluated
                 using SPEC CPU 2006. On x86-64, with a time limit of
                 10ms per instruction, it optimally schedules 79\% of
                 the hot basic blocks in FP2006. Another 19\% of the
                 blocks are not optimally scheduled but are improved in
                 cost relative to LLVM's heuristic. This improves the
                 execution time of some benchmarks by up to 21\%, with a
                 geometric-mean improvement of 2.4\% across the entire
                 benchmark suite. With the use of precise latency
                 information, the geometric-mean improvement is
                 increased to 2.8\%.",
  acknowledgement = ack-nhfb,
  articleno =    "14",
  fjournal =     "ACM Transactions on Architecture and Code Optimization
                 (TACO)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J924",
}

%%% TACO 10(3), article 15.  Enumerators in the abstract are written as
%%% (i) and (ii) without interior spaces.  Title is stored in Title Case;
%%% styles may downcase it.
@Article{She:2013:EEM,
  author =       "Dongrui She and Yifan He and Henk Corporaal",
  title =        "An Energy-Efficient Method of Supporting Flexible
                 Special Instructions in an Embedded Processor with
                 Compact {ISA}",
  journal =      j-TACO,
  volume =       "10",
  number =       "3",
  pages =        "15:1--15:??",
  month =        sep,
  year =         "2013",
  CODEN =        "????",
  DOI =          "http://dx.doi.org/10.1145/2509420.2509426",
  ISSN =         "1544-3566 (print), 1544-3973 (electronic)",
  ISSN-L =       "1544-3566",
  bibdate =      "Mon Sep 16 17:20:12 MDT 2013",
  bibsource =    "http://portal.acm.org/;
                 http://www.math.utah.edu/pub/tex/bib/taco.bib",
  abstract =     "In application-specific processor design, a common
                 approach to improve performance and efficiency is to
                 use special instructions that execute complex operation
                 patterns. However, in a generic embedded processor with
                 compact Instruction Set Architecture (ISA), these
                 special instructions may lead to large overhead such
                 as: (i) more bits are needed to encode the extra
                 opcodes and operands, resulting in wider instructions;
                 (ii) more Register File (RF) ports are required to
                 provide the extra operands to the function units. Such
                 overhead may increase energy consumption considerably.
                 In this article, we propose to support flexible
                 operation pair patterns in a processor with a compact
                 24-bit RISC-like ISA using: (i) a partially
                 reconfigurable decoder that exploits the pattern
                 locality to reduce opcode space requirement; (ii) a
                 software-controlled bypass network to reduce operand
                 encoding bit and RF port requirement. An energy-aware
                 compiler backend is designed for the proposed
                 architecture that performs pattern selection and
                 bypass-aware scheduling to generate energy-efficient
                 codes. Though the proposed design imposes extra
                 constraints on the operation patterns, the experimental
                 results show that for benchmark applications from
                 different domains, the average dynamic instruction
                 count is reduced by over 25\%, which is only about 2\%
                 less than the architecture without such constraints.
                 The proposed architecture reduces total energy by an
                 average of 15.8\% compared to the RISC baseline, while
                 the one without constraints achieves almost no
                 improvement due to its high overhead. When high
                 performance is required, the proposed architecture is
                 able to achieve a speedup of 13.8\% with 13.1\% energy
                 reduction compared to the baseline by introducing
                 multicycle SFU operations.",
  acknowledgement = ack-nhfb,
  articleno =    "15",
  fjournal =     "ACM Transactions on Architecture and Code Optimization
                 (TACO)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J924",
}

%%% TACO 10(3), September 2013, article 16.
%%% NOTE(review): the abstract says ``Single Static Assignment (SSA)''
%%% near its end but ``Width Static Single Assignment'' earlier; left
%%% as transcribed --- confirm against the publisher's abstract.
@Article{Nandivada:2013:IBA,
  author =       "V. Krishna Nandivada and Rajkishore Barik",
  title =        "Improved bitwidth-aware variable packing",
  journal =      j-TACO,
  volume =       "10",
  number =       "3",
  pages =        "16:1--16:??",
  month =        sep,
  year =         "2013",
  CODEN =        "????",
  DOI =          "http://dx.doi.org/10.1145/2509420.2509427",
  ISSN =         "1544-3566 (print), 1544-3973 (electronic)",
  ISSN-L =       "1544-3566",
  bibdate =      "Mon Sep 16 17:20:12 MDT 2013",
  bibsource =    "http://portal.acm.org/;
                 http://www.math.utah.edu/pub/tex/bib/taco.bib",
  abstract =     "Bitwidth-aware register allocation has caught the
                 attention of researchers aiming to effectively reduce
                 the number of variables spilled into memory. For
                 general-purpose processors, this improves the execution
                 time performance and reduces runtime memory
                 requirements (which in turn helps in the compilation of
                 programs targeted to systems with constrained memory).
                 Additionally, bitwidth-aware register allocation has
                 been effective in reducing power consumption in
                 embedded processors. One of the key components of
                 bitwidth-aware register allocation is the variable
                 packing algorithm that packs multiple narrow-width
                 variables into one physical register. Tallam and Gupta
                 [2003] have proved that optimal variable packing is an
                 NP-complete problem for arbitrary-width variables and
                 have proposed an approximate solution. In this article,
                 we analyze the complexity of the variable packing
                 problem and present three enhancements that improve the
                 overall packing of variables. In particular, the
                 improvements we describe are: (a) Width Static Single
                 Assignment (W-SSA) form representation that splits the
                 live range of a variable into several fixed-width live
                 ranges (W-SSA variables); (b) PoTR Representation ---
                 use of powers-of-two representation for bitwidth
                 information for W-SSA variables. Our empirical results
                 have shown that the associated bit wastage resulting
                 from the overapproximation of the widths of variables
                 to the nearest next power of two is a small fraction
                 compared to the total number of bits in use ($ \approx
                 $ 13\%). The main advantage of this representation is
                 that it leads to optimal variable packing in polynomial
                 time; (c) Combined Packing and Coalescing --- we
                 discuss the importance of coalescing (combining
                 variables whose live ranges do not interfere) in the
                 context of variable packing and present an iterative
                 algorithm to perform coalescing and packing of W-SSA
                 variables represented in PoTR. Our experimental results
                 show up to 76.00\% decrease in the number of variables
                 compared to the number of variables in the input
                 program in Single Static Assignment (SSA) form. This
                 reduction in the number of variables led to a
                 significant reduction in dynamic spilling, packing, and
                 unpacking instructions.",
  acknowledgement = ack-nhfb,
  articleno =    "16",
  fjournal =     "ACM Transactions on Architecture and Code Optimization
                 (TACO)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J924",
}

%%% TACO 10(3), September 2013, article 17.
@Article{Ahn:2013:SHR,
  author =       "Jung Ho Ahn and Young Hoon Son and John Kim",
  title =        "Scalable high-radix router microarchitecture using a
                 network switch organization",
  journal =      j-TACO,
  volume =       "10",
  number =       "3",
  pages =        "17:1--17:??",
  month =        sep,
  year =         "2013",
  CODEN =        "????",
  DOI =          "http://dx.doi.org/10.1145/2512433",
  ISSN =         "1544-3566 (print), 1544-3973 (electronic)",
  ISSN-L =       "1544-3566",
  bibdate =      "Mon Sep 16 17:20:12 MDT 2013",
  bibsource =    "http://portal.acm.org/;
                 http://www.math.utah.edu/pub/tex/bib/taco.bib",
  abstract =     "As the system size of supercomputers and datacenters
                 increases, cost-efficient networks become critical in
                 achieving good scalability on those systems.
                 High-radix routers reduce network cost by lowering the
                 network diameter while providing a high bisection
                 bandwidth and path diversity. The building blocks of
                 these large-scale networks are the routers or the
                 switches and they need to scale accordingly to the
                 increasing port count and increasing pin bandwidth.
                 However, as the port count increases, the high-radix
                 router microarchitecture itself needs to scale
                 efficiently. Hierarchical crossbar switch organization
                 has been proposed where a single large crossbar used
                 for a router switch is partitioned into many small
                 crossbars and overcomes the limitations of conventional
                 router microarchitecture. Although the organization
                 provides high performance, it has limited scalability
                 due to excessive power and area overheads by the wires
                 and intermediate buffers. In this article, we propose
                 scalable router microarchitectures that leverage a
                 network within the switch design of the high-radix
                 routers themselves. These alternative designs lower the
                 wiring complexity and buffer requirements. For example,
                 when a folded-Clos switch is used instead of the
                 hierarchical crossbar switch for a radix-64 router, it
                 provides up to 73\%, 58\%, and 87\% reduction in area,
                 energy-delay product, and energy-delay-area product,
                 respectively. We also explore more efficient switch
                 designs by exploiting the traffic-pattern
                 characteristics of the global network and its impact on
                 the local network design within the switch for both
                 folded-Clos and flattened butterfly networks. In
                 particular, we propose a bilateral butterfly switch
                 organization that has fewer crossbars and global wires
                 compared to the topology-agnostic folded-Clos switch
                 while achieving better low-load latency and equivalent
                 saturation throughput.",
  acknowledgement = ack-nhfb,
  articleno =    "17",
  fjournal =     "ACM Transactions on Architecture and Code Optimization
                 (TACO)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J924",
}

%%% TACO 10(3), September 2013, article 18.  The braces around MPI and
%%% NoC in the title protect those acronyms from style recasing.
%%% NOTE(review): the final page is recorded as ``??'' (presumably
%%% unknown); the articleno field identifies the article.
@Article{Huang:2013:ACM,
  author =       "Libo Huang and Zhiying Wang and Nong Xiao and Yongwen
                 Wang and Qiang Dou",
  title =        "Adaptive communication mechanism for accelerating
                 {MPI} functions in {NoC}-based multicore processors",
  journal =      j-TACO,
  volume =       "10",
  number =       "3",
  pages =        "18:1--18:??",
  month =        sep,
  year =         "2013",
  CODEN =        "????",
  DOI =          "http://dx.doi.org/10.1145/2512434",
  ISSN =         "1544-3566 (print), 1544-3973 (electronic)",
  ISSN-L =       "1544-3566",
  bibdate =      "Mon Sep 16 17:20:12 MDT 2013",
  bibsource =    "http://portal.acm.org/;
                 http://www.math.utah.edu/pub/tex/bib/taco.bib",
  abstract =     "Multicore designs have emerged as the dominant
                 organization for future high-performance
                 microprocessors. Communication in such designs is often
                 enabled by Networks-on-Chip (NoCs). A new trend in such
                 architectures is to fit a Message Passing Interface
                 (MPI) programming model on NoCs to achieve optimal
                 parallel application performance. A key issue in
                 designing MPI over NoCs is communication protocol,
                 which has not been explored in previous research. This
                 article advocates a hardware-supported communication
                 mechanism using a protocol-adaptive approach to adjust
                 to varying NoC configurations (e.g., number of buffers)
                 and workload behavior (e.g., number of messages). We
                 propose the ADaptive Communication Mechanism (ADCM), a
                 hybrid protocol that involves behavior similar to
                 buffered communication when sufficient buffer is
                 available in the receiver to that similar to a
                 synchronous protocol when buffers in the receiver are
                 limited. ADCM adapts dynamically by deciding
                 communication protocol on a per-request basis using a
                 local estimate of recent buffer utilization. ADCM
                 attempts to combine both the advantages of buffered and
                 synchronous communication modes to achieve enhanced
                 throughput and performance. Simulations of various
                 workloads show that the proposed communication
                 mechanism can be effectively used in future NoC
                 designs.",
  acknowledgement = ack-nhfb,
  articleno =    "18",
  fjournal =     "ACM Transactions on Architecture and Code Optimization
                 (TACO)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J924",
}

%%% TACO 10(3), September 2013, article 19.
%%% NOTE(review): the last abstract sentence mixes verb number
%%% (``comparison ... show ... and gives''); left as transcribed ---
%%% confirm against the publisher's abstract before correcting.
@Article{Malik:2013:OSG,
  author =       "Avinash Malik and David Gregg",
  title =        "Orchestrating stream graphs using model checking",
  journal =      j-TACO,
  volume =       "10",
  number =       "3",
  pages =        "19:1--19:??",
  month =        sep,
  year =         "2013",
  CODEN =        "????",
  DOI =          "http://dx.doi.org/10.1145/2512435",
  ISSN =         "1544-3566 (print), 1544-3973 (electronic)",
  ISSN-L =       "1544-3566",
  bibdate =      "Mon Sep 16 17:20:12 MDT 2013",
  bibsource =    "http://portal.acm.org/;
                 http://www.math.utah.edu/pub/tex/bib/taco.bib",
  abstract =     "In this article we use model checking to statically
                 distribute and schedule Synchronous DataFlow (SDF)
                 graphs on heterogeneous execution architectures. We
                 show that model checking is capable of providing an
                 optimal solution and it arrives at these solutions
                 faster (in terms of algorithm runtime) than equivalent
                 ILP formulations. Furthermore, we also show how
                 different types of optimizations such as task
                 parallelism, data parallelism, and state sharing can be
                 included within our framework. Finally, comparison of
                 our approach with the current state-of-the-art
                 heuristic techniques show the pitfalls of these
                 techniques and gives a glimpse of how these heuristic
                 techniques can be improved.",
  acknowledgement = ack-nhfb,
  articleno =    "19",
  fjournal =     "ACM Transactions on Architecture and Code Optimization
                 (TACO)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J924",
}

%%% TACO 10(3), September 2013, article 20.  The second author's
%%% surname is spelled O'Boyle, with a capital B.
@Article{Wang:2013:UML,
  author =       "Zheng Wang and Michael F. P. O'Boyle",
  title =        "Using machine learning to partition streaming
                 programs",
  journal =      j-TACO,
  volume =       "10",
  number =       "3",
  pages =        "20:1--20:??",
  month =        sep,
  year =         "2013",
  CODEN =        "????",
  DOI =          "http://dx.doi.org/10.1145/2512436",
  ISSN =         "1544-3566 (print), 1544-3973 (electronic)",
  ISSN-L =       "1544-3566",
  bibdate =      "Mon Sep 16 17:20:12 MDT 2013",
  bibsource =    "http://portal.acm.org/;
                 http://www.math.utah.edu/pub/tex/bib/taco.bib",
  abstract =     "Stream-based parallel languages are a popular way to
                 express parallelism in modern applications. The
                 efficient mapping of streaming parallelism to today's
                 multicore systems is, however, highly dependent on the
                 program and underlying architecture. We address this by
                 developing a portable and automatic compiler-based
                 approach to partitioning streaming programs using
                 machine learning. Our technique predicts the ideal
                 partition structure for a given streaming application
                 using prior knowledge learned offline. Using the
                 predictor we rapidly search the program space (without
                 executing any code) to generate and select a good
                 partition. We applied this technique to standard
                 StreamIt applications and compared against existing
                 approaches. On a 4-core platform, our approach achieves
                 60\% of the best performance found by iteratively
                 compiling and executing over 3000 different partitions
                 per program. We obtain, on average, a 1.90$ \times $
                 speedup over the already tuned partitioning scheme of
                 the StreamIt compiler. When compared against a
                 state-of-the-art analytical, model-based approach, we
                 achieve, on average, a 1.77$ \times $ performance
                 improvement. By porting our approach to an 8-core
                 platform, we are able to obtain 1.8$ \times $
                 improvement over the StreamIt default scheme,
                 demonstrating the portability of our approach.",
  acknowledgement = ack-nhfb,
  articleno =    "20",
  fjournal =     "ACM Transactions on Architecture and Code Optimization
                 (TACO)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J924",
}

%%% TACO 10(3), September 2013, article 21.  Quoted terms in the
%%% abstract use TeX-style `` and '' pairs, so the quote-delimited
%%% field contains no bare double-quote characters.
@Article{Bakhoda:2013:DCN,
  author =       "Ali Bakhoda and John Kim and Tor M. Aamodt",
  title =        "Designing on-chip networks for throughput
                 accelerators",
  journal =      j-TACO,
  volume =       "10",
  number =       "3",
  pages =        "21:1--21:??",
  month =        sep,
  year =         "2013",
  CODEN =        "????",
  DOI =          "http://dx.doi.org/10.1145/2512429",
  ISSN =         "1544-3566 (print), 1544-3973 (electronic)",
  ISSN-L =       "1544-3566",
  bibdate =      "Mon Sep 16 17:20:12 MDT 2013",
  bibsource =    "http://portal.acm.org/;
                 http://www.math.utah.edu/pub/tex/bib/taco.bib",
  abstract =     "As the number of cores and threads in throughput
                 accelerators such as Graphics Processing Units (GPU)
                 increases, so does the importance of on-chip
                 interconnection network design. This article explores
                 throughput-effective Network-on-Chips (NoC) for future
                 compute accelerators that employ Bulk-Synchronous
                 Parallel (BSP) programming models such as CUDA and
                 OpenCL. A hardware optimization is ``throughput
                 effective'' if it improves parallel application-level
                 performance per unit chip area. We evaluate performance
                 of future looking workloads using detailed closed-loop
                 simulations modeling compute nodes, NoC, and the DRAM
                 memory system. We start from a mesh design with
                 bisection bandwidth balanced to off-chip demand.
                 Accelerator workloads tend to demand high off-chip
                 memory bandwidth which results in a many-to-few traffic
                 pattern when coupled with expected technology
                 constraints of slow growth in pins-per-chip. Leveraging
                 these observations we reduce NoC area by proposing a
                 ``checkerboard'' NoC which alternates between
                 conventional full routers and half routers with limited
                 connectivity. Next, we show that increasing network
                 terminal bandwidth at the nodes connected to DRAM
                 controllers alleviates a significant fraction of the
                 remaining imbalance resulting from the many-to-few
                 traffic pattern. Furthermore, we propose a ``double
                 checkerboard inverted'' NoC organization which takes
                 advantage of channel slicing to reduce area while
                 maintaining the performance improvements of the
                 aforementioned techniques. This organization also has a
                 simpler routing mechanism and improves average
                 application throughput per unit area by 24.3\%.",
  acknowledgement = ack-nhfb,
  articleno =    "21",
  fjournal =     "ACM Transactions on Architecture and Code Optimization
                 (TACO)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J924",
}

%%% TACO 10(4), December 2013, article 22.  The title carries no
%%% trailing footnote marker; the publisher's footnote number is not
%%% part of the article title.
@Article{Jantz:2013:ESM,
  author =       "Michael R. Jantz and Prasad A. Kulkarni",
  title =        "Exploring single and multilevel {JIT} compilation
                 policy for modern machines",
  journal =      j-TACO,
  volume =       "10",
  number =       "4",
  pages =        "22:1--22:??",
  month =        dec,
  year =         "2013",
  CODEN =        "????",
  DOI =          "http://dx.doi.org/10.1145/2541228.2541229",
  ISSN =         "1544-3566 (print), 1544-3973 (electronic)",
  ISSN-L =       "1544-3566",
  bibdate =      "Mon Dec 23 10:31:41 MST 2013",
  bibsource =    "http://portal.acm.org/;
                 http://www.math.utah.edu/pub/tex/bib/taco.bib",
  abstract =     "Dynamic or Just-in-Time (JIT) compilation is essential
                 to achieve high-performance emulation for programs
                 written in managed languages, such as Java and C\#. It
                 has been observed that a conservative JIT compilation
                 policy is most effective to obtain good runtime
                 performance without impeding application progress on
                 single-core machines. At the same time, it is often
                 suggested that a more aggressive dynamic compilation
                 strategy may perform best on modern machines that
                 provide abundant computing resources, especially with
                 virtual machines (VMs) that are also capable of
                 spawning multiple concurrent compiler threads. However,
                 comprehensive research on the best JIT compilation
                 policy for such modern processors and VMs is currently
                 lacking. The goal of this work is to explore the
                 properties of single-tier and multitier JIT compilation
                 policies that can enable existing and future VMs to
                 realize the best program performance on modern
                 machines. In this work, we design novel experiments and
                 implement new VM configurations to effectively control
                 the compiler aggressiveness and optimization levels
                 (if and when methods are compiled) in the
                 industry-standard Oracle HotSpot Java VM to achieve
                 this goal. We find that the best JIT compilation policy
                 is determined by the nature of the application and the
                 speed and effectiveness of the dynamic compilers. We
                 extend earlier results showing the suitability of
                 conservative JIT compilation on single-core machines
                 for VMs with multiple concurrent compiler threads. We
                 show that employing the free compilation resources
                 (compiler threads and hardware cores) to aggressively
                 compile more program methods quickly reaches a point of
                 diminishing returns. At the same time, we also find
                 that using the free resources to reduce compiler queue
                 backup (compile selected hot methods early)
                 significantly benefits program performance, especially
                 for slower (highly optimizing) JIT compilers. For such
                 compilers, we observe that accurately prioritizing JIT
                 method compiles is crucial to realize the most
                 performance benefit with the smallest hardware budget.
                 Finally, we show that a tiered compilation policy,
                 although complex to implement, greatly alleviates the
                 impact of more and early JIT compilation of programs on
                 modern machines.",
  acknowledgement = ack-nhfb,
  articleno =    "22",
  fjournal =     "ACM Transactions on Architecture and Code Optimization
                 (TACO)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J924",
}

%%% TACO 10(4), December 2013, article 23.
%%% NOTE(review): the final page is recorded as ``??'' (presumably
%%% unknown); the articleno field identifies the article.
@Article{Dong:2013:CAC,
  author =       "Xiangyu Dong and Norman P. Jouppi and Yuan Xie",
  title =        "A circuit-architecture co-optimization framework for
                 exploring nonvolatile memory hierarchies",
  journal =      j-TACO,
  volume =       "10",
  number =       "4",
  pages =        "23:1--23:??",
  month =        dec,
  year =         "2013",
  CODEN =        "????",
  DOI =          "http://dx.doi.org/10.1145/2541228.2541230",
  ISSN =         "1544-3566 (print), 1544-3973 (electronic)",
  ISSN-L =       "1544-3566",
  bibdate =      "Mon Dec 23 10:31:41 MST 2013",
  bibsource =    "http://portal.acm.org/;
                 http://www.math.utah.edu/pub/tex/bib/taco.bib",
  abstract =     "Many new memory technologies are available for
                 building future energy-efficient memory hierarchies. It
                 is necessary to have a framework that can quickly find
                 the optimal memory technology at each hierarchy level.
                 In this work, we first build a circuit-architecture
                 joint design space exploration framework by combining
                 RC circuit analysis and Artificial Neural Network
                 (ANN)-based performance modeling. Then, we use this
                 framework to evaluate some emerging nonvolatile memory
                 hierarchies. We demonstrate that a Resistive RAM
                 (ReRAM)-based cache hierarchy on an 8-core
                 Chip-Multiprocessor (CMP) system can achieve a 24\%
                 Energy Delay Product (EDP) improvement and a 36\%
                 Energy Delay Area Product (EDAP) improvement compared
                 to a conventional hierarchy with SRAM on-chip caches
                 and DRAM main memory.",
  acknowledgement = ack-nhfb,
  articleno =    "23",
  fjournal =     "ACM Transactions on Architecture and Code Optimization
                 (TACO)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J924",
}

%%% TACO 10(4), December 2013, article 24.  GPU is brace-protected in
%%% the title against style recasing.
%%% NOTE(review): the title writes 3D as a braced group with a
%%% math-mode digit, while the abstract uses plain ``3D''; presumably a
%%% file convention --- confirm before normalizing.
@Article{Zhao:2013:OGE,
  author =       "Jishen Zhao and Guangyu Sun and Gabriel H. Loh and
                 Yuan Xie",
  title =        "Optimizing {GPU} energy efficiency with {$3$D}
                 die-stacking graphics memory and reconfigurable memory
                 interface",
  journal =      j-TACO,
  volume =       "10",
  number =       "4",
  pages =        "24:1--24:??",
  month =        dec,
  year =         "2013",
  CODEN =        "????",
  DOI =          "http://dx.doi.org/10.1145/2541228.2541231",
  ISSN =         "1544-3566 (print), 1544-3973 (electronic)",
  ISSN-L =       "1544-3566",
  bibdate =      "Mon Dec 23 10:31:41 MST 2013",
  bibsource =    "http://portal.acm.org/;
                 http://www.math.utah.edu/pub/tex/bib/taco.bib",
  abstract =     "The performance of graphics processing unit (GPU)
                 systems is improving rapidly to accommodate the
                 increasing demands of graphics and high-performance
                 computing applications. With such a performance
                 improvement, however, power consumption of GPU systems
                 is dramatically increased. Up to 30\% of the total
                 power of a GPU system is consumed by the graphic memory
                 itself. Therefore, reducing graphics memory power
                 consumption is critical to mitigate the power
                 challenge. In this article, we propose an
                 energy-efficient reconfigurable 3D die-stacking
                 graphics memory design that integrates wide-interface
                 graphics DRAMs side-by-side with a GPU processor on a
                 silicon interposer. The proposed architecture is a
                 ``3D+2.5D'' system, where the DRAM memory itself is 3D
                 stacked memory with through-silicon via (TSV), whereas
                 the integration of DRAM and the GPU processor is
                 through the interposer solution (2.5D). Since GPU
                 computing units, memory controllers, and memory are all
                 integrated in the same package, the number of memory
                 I/Os is no longer constrained by the package's pin
                 count. We can reduce the memory power consumption by
                 scaling down the supply voltage and frequency of memory
                 interface while maintaining the same or even higher
                 peak memory bandwidth. In addition, we design a
                 reconfigurable memory interface that can dynamically
                 adapt to the requirements of various applications. We
                 propose two reconfiguration mechanisms to optimize the
                 GPU system energy efficiency and throughput,
                 respectively, and thus benefit both memory-intensive
                 and compute-intensive applications. The experimental
                 results show that the proposed GPU memory architecture
                 can effectively improve GPU system energy efficiency by
                 21\%, without reconfiguration. The reconfigurable
                 memory interface can further improve the system energy
                 efficiency by 26\%, and system throughput by 31\% under
                 a capped system power budget of 240W.",
  acknowledgement = ack-nhfb,
  articleno =    "24",
  fjournal =     "ACM Transactions on Architecture and Code Optimization
                 (TACO)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J924",
}

%%% TACO 10(4), December 2013, article 25.  The bibsource field lists
%%% string-matching.bib in addition to taco.bib, so this entry is
%%% presumably cross-listed there; keep the two copies in step.
%%% Aho--Corasick is brace-protected in the title as a proper name.
@Article{Chen:2013:EMT,
  author =       "Chien-Chi Chen and Sheng-De Wang",
  title =        "An efficient multicharacter transition string-matching
                 engine based on the {Aho--Corasick} algorithm",
  journal =      j-TACO,
  volume =       "10",
  number =       "4",
  pages =        "25:1--25:??",
  month =        dec,
  year =         "2013",
  CODEN =        "????",
  DOI =          "http://dx.doi.org/10.1145/2541228.2541232",
  ISSN =         "1544-3566 (print), 1544-3973 (electronic)",
  ISSN-L =       "1544-3566",
  bibdate =      "Mon Dec 23 10:31:41 MST 2013",
  bibsource =    "http://portal.acm.org/;
                 http://www.math.utah.edu/pub/tex/bib/string-matching.bib;
                 http://www.math.utah.edu/pub/tex/bib/taco.bib",
  abstract =     "A string-matching engine capable of inspecting
                 multiple characters in parallel can multiply the
                 throughput. However, the space required for
                 implementing a matching engine that can process
                 multiple characters in parallel generally grows
                 exponentially with respect to the characters to be
                 processed in parallel. Based on the Aho--Corasick
                 algorithm (AC-algorithm), this work presents a novel
                 multicharacter transition Nondeterministic Finite
                 Automaton (NFA) approach, called multicharacter AC-NFA,
                 to allow for the inspection of multiple characters in
                 parallel. This approach first converts an AC-trie to an
                 AC-NFA by allowing for the simultaneous activation of
                 multiple states and then converts the AC-NFA to a
                 $k$-character AC-NFA by an algorithm with concatenation
                 operations and assistant transitions. Additionally, the
                 alignment problem, which occurs while multiple
                 characters are being inspected in parallel, is solved
                 using assistant transitions. Moreover, a corresponding
                 output is provided for each inspected character by
                 introducing priority multiplexers to determine the
                 final matching outputs during implementation of the
                 multicharacter AC-NFA. Consequently, the number of
                 derived $k$-character transitions grows linearly with
                 respect to the number $k$. Furthermore, the derived
                 multicharacter AC-NFA is implemented on FPGAs for
                 evaluation. The resulting throughput grows
                 approximately 14 times and the hardware cost grows
                 about 18 times for 16-character AC-NFA implementation,
                 as compared with that for 1-character AC-NFA
                 implementation. The achievable throughput is 21.4Gbps
                 for the 16-character AC-NFA implementation operating at
                 a 167.36MHz clock.",
  acknowledgement = ack-nhfb,
  articleno =    "25",
  fjournal =     "ACM Transactions on Architecture and Code Optimization
                 (TACO)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J924",
}

%%% TACO 10(4), December 2013, article 26.  The benchmark suites named
%%% in the abstract are SPEC CPU2000 and CPU2006.
@Article{Luo:2013:DIH,
  author =       "Yangchun Luo and Wei-Chung Hsu and Antonia Zhai",
  title =        "The design and implementation of heterogeneous
                 multicore systems for energy-efficient speculative
                 thread execution",
  journal =      j-TACO,
  volume =       "10",
  number =       "4",
  pages =        "26:1--26:??",
  month =        dec,
  year =         "2013",
  CODEN =        "????",
  DOI =          "http://dx.doi.org/10.1145/2541228.2541233",
  ISSN =         "1544-3566 (print), 1544-3973 (electronic)",
  ISSN-L =       "1544-3566",
  bibdate =      "Mon Dec 23 10:31:41 MST 2013",
  bibsource =    "http://portal.acm.org/;
                 http://www.math.utah.edu/pub/tex/bib/taco.bib",
  abstract =     "With the emergence of multicore processors, various
                 aggressive execution models have been proposed to
                 exploit fine-grained thread-level parallelism, taking
                 advantage of the fast on-chip interconnection
                 communication. However, the aggressive nature of these
                 execution models often leads to excessive energy
                 consumption incommensurate to execution time reduction.
                 In the context of Thread-Level Speculation, we
                 demonstrated that on a same-ISA heterogeneous multicore
                 system, by dynamically deciding how on-chip resources
                 are utilized, speculative threads can achieve
                 performance gain in an energy-efficient way. Through a
                 systematic design space exploration, we built a
                 multicore architecture that integrates heterogeneous
                 components of processing cores and first-level caches.
                 To cope with processor reconfiguration overheads, we
                 introduced runtime mechanisms to mitigate their
                 impacts. To match program execution with the most
                 energy-efficient processor configuration, the system
                 was equipped with a dynamic resource allocation scheme
                 that characterizes program behaviors using novel
                 processor counters. We evaluated the proposed
                 heterogeneous system with a diverse set of benchmark
                 programs from SPEC CPU2000 and CPU2006 suites.
                 Compared to the most efficient homogeneous TLS
                 implementation, we achieved similar performance but
                 consumed 18\% less energy. Compared to the most
                 efficient homogeneous uniprocessor running sequential
                 programs, we improved performance by 29\% and reduced
                 energy consumption by 3.6\%, which is a 42\%
                 improvement in energy-delay-squared product.",
  acknowledgement = ack-nhfb,
  articleno =    "26",
  fjournal =     "ACM Transactions on Architecture and Code Optimization
                 (TACO)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J924",
}

@Article{Rolan:2013:VSC,
  author =       "Dyer Rol{\'a}n and Basilio B. Fraguela and Ram{\'o}n
                 Doallo",
  title =        "Virtually split cache: an efficient mechanism to
                 distribute instructions and data",
  journal =      j-TACO,
  volume =       "10",
  number =       "4",
  pages =        "27:1--27:??",
  month =        dec,
  year =         "2013",
  CODEN =        "????",
  DOI =          "http://dx.doi.org/10.1145/2541228.2541234",
  ISSN =         "1544-3566 (print), 1544-3973 (electronic)",
  ISSN-L =       "1544-3566",
  bibdate =      "Mon Dec 23 10:31:41 MST 2013",
  bibsource =    "http://portal.acm.org/;
                 http://www.math.utah.edu/pub/tex/bib/taco.bib",
  abstract =     "First-level caches are usually split for both
                 instructions and data instead of unifying them in a
                 single cache. Although that approach eases the pipeline
                 design and provides a simple way to independently treat
                 data and instructions, its global hit rate is usually
                 smaller than that of a unified cache. Furthermore,
                 unified lower-level caches usually behave and process
                 memory requests disregarding whether they are data or
                 instruction requests. In this article, we propose a new
                 technique aimed to balance the amount of space devoted
                 to instructions and data for optimizing set-associative
                 caches: the Virtually Split Cache or VSC. Our technique
                 combines the sharing of resources from unified
                 approaches with the bandwidth and parallelism that
                 split configurations provide, thus reducing power
                 consumption while not degrading performance. Our design
                 dynamically adjusts cache resources devoted to
                 instructions and data depending on their particular
                 demand. Two VSC designs are proposed in order to track
                 the instructions and data requirements. The Shadow Tag
                 VSC (ST-VSC) is based on shadow tags that store the
                 last evicted line related to data and instructions in
                 order to determine how well the cache would work with
                 one more way per set devoted to each kind. The Global
                 Selector VSC (GS-VSC) uses a saturation counter that is
                 updated every time a cache miss occurs either under an
                 instruction or data request applying a duel-like
                 mechanism. Experiments with a variable and a fixed
                 latency VSC show that ST-VSC and GS-VSC reduce on
                 average the cache hierarchy power consumption by 29\%
                 and 24\%, respectively, with respect to a standard
                 baseline. As for performance, while the fixed latency
                 designs virtually match the split baseline in a
                 single-core system, a variable latency ST-VSC and
                 GS-VSC increase the average IPC by 2.5\% and 2\%,
                 respectively. In multicore systems, even the slower
                 fixed latency ST-VSC and GS-VSC designs improve the
                 baseline IPC by 3.1\% and 2.5\%, respectively, in a
                 four-core system thanks to the reduction in the
                 bandwidth demanded from the lower cache levels. This is
                 in contrast with many techniques that trade performance
                 degradation for power consumption reduction. VSC
                 particularly benefits embedded processors with a single
                 level of cache, where up to an average 9.2\% IPC
                 improvement is achieved. Interestingly, we also find
                 that partitioning the LLC for instructions and data can
                 improve performance around 2\%.",
  acknowledgement = ack-nhfb,
  articleno =    "27",
  fjournal =     "ACM Transactions on Architecture and Code Optimization
                 (TACO)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J924",
}

@Article{Subramaniam:2013:UFC,
  author =       {Samantika Subramaniam and Simon C. Steely and
                 Will Hasenplaugh and Aamer Jaleel and Carl Beckmann and
                 Tryggve Fossum and Joel Emer},
  title =        {Using in-flight chains to build a scalable
                 cache coherence protocol},
  journal =      j-TACO,
  volume =       {10},
  number =       {4},
  pages =        {28:1--28:??},
  month =        dec,
  year =         {2013},
  CODEN =        {????},
  DOI =          {http://dx.doi.org/10.1145/2541228.2541235},
  ISSN =         {1544-3566 (print), 1544-3973 (electronic)},
  ISSN-L =       {1544-3566},
  bibdate =      {Mon Dec 23 10:31:41 MST 2013},
  bibsource =    {http://portal.acm.org/;
                 http://www.math.utah.edu/pub/tex/bib/taco.bib},
  abstract =     {As microprocessor designs integrate more cores, scalability
                 of cache coherence protocols becomes a challenging problem.
                 Most directory-based protocols avoid races by using blocking
                 tag directories that can impact the performance of parallel
                 applications. In this article, we first quantitatively
                 demonstrate that state-of-the-art blocking protocols
                 significantly constrain throughput at large core counts for
                 several parallel applications. Nonblocking protocols address
                 this throughput concern at the expense of scalability in the
                 interconnection network or in the required resource
                 overheads. To address this concern, we enhance nonblocking
                 directory protocols by migrating the point of service of
                 responses. Our approach uses in-flight chains of cores making
                 parallel memory requests to incorporate scalability while
                 maintaining high-throughput. The proposed cache coherence
                 protocol called chained cache coherence, can outperform
                 blocking protocols by up to 20\% on scientific and 12\% on
                 commercial applications. It also has low resource overheads
                 and simple address ordering requirements making it both a
                 high-performance and scalable protocol. Furthermore,
                 in-flight chains provide a scalable solution to building
                 hierarchical and nonblocking tag directories as well as
                 optimize communication latencies.},
  acknowledgement = ack-nhfb,
  articleno =    {28},
  fjournal =     {ACM Transactions on Architecture and Code
                 Optimization (TACO)},
  journal-URL =  {http://portal.acm.org/browse_dl.cfm?idx=J924},
}

@Article{Sanchez:2013:MIP,
  author =       "Daniel S{\'a}nchez and Yiannakis Sazeides and Juan M.
                 Cebri{\'a}n and Jos{\'e} M. Garc{\'\i}a and Juan L.
                 Arag{\'o}n",
  title =        "Modeling the impact of permanent faults in caches",
  journal =      j-TACO,
  volume =       "10",
  number =       "4",
  pages =        "29:1--29:??",
  month =        dec,
  year =         "2013",
  CODEN =        "????",
  DOI =          "http://dx.doi.org/10.1145/2541228.2541236",
  ISSN =         "1544-3566 (print), 1544-3973 (electronic)",
  ISSN-L =       "1544-3566",
  bibdate =      "Mon Dec 23 10:31:41 MST 2013",
  bibsource =    "http://portal.acm.org/;
                 http://www.math.utah.edu/pub/tex/bib/taco.bib",
  abstract =     "The traditional performance cost benefits we have
                 enjoyed for decades from technology scaling are
                 challenged by several critical constraints including
                 reliability. Increases in static and dynamic variations
                 are leading to higher probability of parametric and
                 wear-out failures and are elevating reliability into a
                 prime design constraint. In particular, SRAM cells used
                 to build caches that dominate the processor area are
                 usually minimum sized and more prone to failure. It is
                 therefore of paramount importance to develop effective
                 methodologies that facilitate the exploration of
                 reliability techniques for caches. To this end, we
                 present an analytical model that can determine for a
                 given cache configuration, address trace, and random
                 probability of permanent cell failure the exact
                 expected miss rate and its standard deviation when
                 blocks with faulty bits are disabled. What
                 distinguishes our model is that it is fully analytical,
                 it avoids the use of fault maps, and yet, it is both
                 exact and simpler than previous approaches. The
                 analytical model is used to produce the miss-rate
                 trends (expected miss-rate) for future technology
                 nodes for both uncorrelated and clustered faults. Some
                 of the key findings based on the proposed model are (i)
                 block disabling has a negligible impact on the expected
                 miss-rate unless probability of failure is equal or
                 greater than 2.6e-4, (ii) the fault map methodology can
                 accurately calculate the expected miss-rate as long as
                 1,000 to 10,000 fault maps are used, and (iii) the
                 expected miss-rate for execution of parallel
                 applications increases with the number of threads and
                 is more pronounced for a given probability of failure
                 as compared to sequential execution.",
  acknowledgement = ack-nhfb,
  articleno =    "29",
  fjournal =     "ACM Transactions on Architecture and Code Optimization
                 (TACO)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J924",
}

@Article{Lee:2013:APF,
  author =       "Sanghoon Lee and James Tuck",
  title =        "Automatic parallelization of fine-grained
                 metafunctions on a chip multiprocessor",
  journal =      j-TACO,
  volume =       "10",
  number =       "4",
  pages =        "30:1--30:??",
  month =        dec,
  year =         "2013",
  CODEN =        "????",
  DOI =          "http://dx.doi.org/10.1145/2541228.2541237",
  ISSN =         "1544-3566 (print), 1544-3973 (electronic)",
  ISSN-L =       "1544-3566",
  bibdate =      "Mon Dec 23 10:31:41 MST 2013",
  bibsource =    "http://portal.acm.org/;
                 http://www.math.utah.edu/pub/tex/bib/taco.bib",
  abstract =     "Due to the importance of reliability and security,
                 prior studies have proposed inlining metafunctions into
                 applications for detecting bugs and security
                 vulnerabilities. However, because these software
                 techniques add frequent, fine-grained instrumentation
                 to programs, they often incur large runtime overheads.
                 In this work, we consider an automatic thread
                 extraction technique for removing these fine-grained
                 checks from a main application and scheduling them on
                 helper threads. In this way, we can leverage the
                 resources available on a CMP to reduce the latency and
                 overhead of fine-grained checking codes. Our
                 parallelization strategy extracts metafunctions from a
                 single threaded application and executes them in
                 customized helper threads---threads constructed to
                 mirror relevant fragments of the main program's
                 behavior in order to keep communication and overhead
                 low. To get good performance, we consider optimizations
                 that reduce communication and balance work among many
                 threads. We evaluate our parallelization strategy on
                 Mudflap, a pointer-use checking tool in GCC. To show
                 the benefits of our technique, we compare it to a
                 manually parallelized version of Mudflap. We run our
                 experiments on an architectural simulator with support
                 for fast queueing operations. On a subset of SPECint
                 2000, our automatically parallelized code using static
                 load balance is only 19\% slower, on average, than the
                 manually parallelized version on a simulated eight-core
                 system. In addition, our automatically parallelized
                 code using dynamic load balance is competitive, on
                 average, to the manually parallelized version on a
                 simulated eight-core system. Furthermore, all the
                 applications except parser achieve better speedups with
                 our automatic algorithms than with the manual approach.
                 Also, our approach introduces very little overhead in
                 the main program---it is kept under 100\%, which is
                 more than a 5.3$ \times $ reduction compared to serial
                 Mudflap.",
  acknowledgement = ack-nhfb,
  articleno =    "30",
  fjournal =     "ACM Transactions on Architecture and Code Optimization
                 (TACO)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J924",
}

@Article{Dubach:2013:DMA,
  author =       {Christophe Dubach and Timothy M. Jones and
                 Edwin V. Bonilla},
  title =        {Dynamic microarchitectural adaptation using
                 machine learning},
  journal =      j-TACO,
  volume =       {10},
  number =       {4},
  pages =        {31:1--31:??},
  month =        dec,
  year =         {2013},
  CODEN =        {????},
  DOI =          {http://dx.doi.org/10.1145/2541228.2541238},
  ISSN =         {1544-3566 (print), 1544-3973 (electronic)},
  ISSN-L =       {1544-3566},
  bibdate =      {Mon Dec 23 10:31:41 MST 2013},
  bibsource =    {http://portal.acm.org/;
                 http://www.math.utah.edu/pub/tex/bib/taco.bib},
  abstract =     {Adaptive microarchitectures are a promising solution for
                 designing high-performance, power-efficient microprocessors.
                 They offer the ability to tailor computational resources to
                 the specific requirements of different programs or program
                 phases. They have the potential to adapt the hardware
                 cost-effectively at runtime to any application's needs.
                 However, one of the key challenges is how to dynamically
                 determine the best architecture configuration at any given
                 time, for any new workload. This article proposes a novel
                 control mechanism based on a predictive model for
                 microarchitectural adaptivity control. This model is able to
                 efficiently control adaptivity by monitoring the behaviour of
                 an application's different phases at runtime. We show that by
                 using this model on SPEC 2000, we double the energy\slash
                 performance efficiency of the processor when compared to the
                 best static configuration tuned for the whole benchmark
                 suite. This represents 74\% of the improvement available if
                 we know the best microarchitecture for each program phase
                 ahead of time. In addition, we present an extended analysis
                 of the best configurations found and show that the overheads
                 associated with the implementation of our scheme have a
                 negligible impact on performance and power.},
  acknowledgement = ack-nhfb,
  articleno =    {31},
  fjournal =     {ACM Transactions on Architecture and Code
                 Optimization (TACO)},
  journal-URL =  {http://portal.acm.org/browse_dl.cfm?idx=J924},
}

@Article{Chen:2013:CME,
  author =       "Long Chen and Yanan Cao and Zhao Zhang",
  title =        "{E$^3$CC}: a memory error protection scheme with novel
                 address mapping for subranked and low-power memories",
  journal =      j-TACO,
  volume =       "10",
  number =       "4",
  pages =        "32:1--32:??",
  month =        dec,
  year =         "2013",
  CODEN =        "????",
  DOI =          "http://dx.doi.org/10.1145/2541228.2541239",
  ISSN =         "1544-3566 (print), 1544-3973 (electronic)",
  ISSN-L =       "1544-3566",
  bibdate =      "Mon Dec 23 10:31:41 MST 2013",
  bibsource =    "http://portal.acm.org/;
                 http://www.math.utah.edu/pub/tex/bib/taco.bib",
  abstract =     "This study presents and evaluates E$^3$CC (Enhanced
                 Embedded ECC), a full design and implementation of a
                 generic embedded ECC scheme that enables
                 power-efficient error protection for subranked memory
                 systems. It incorporates a novel address mapping scheme
                 called Biased Chinese Remainder Mapping (BCRM) to
                 resolve the address mapping issue for memories of page
                 interleaving, plus a simple and effective cache design
                 to reduce extra ECC traffic. Our evaluation using SPEC
                 CPU2006 benchmarks confirms the performance and power
                 efficiency of the E$^3$CC scheme for subranked
                 memories as well as conventional memories.",
  acknowledgement = ack-nhfb,
  articleno =    "32",
  fjournal =     "ACM Transactions on Architecture and Code Optimization
                 (TACO)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J924",
}

@Article{Tian:2013:TBM,
  author =       "Yingying Tian and Samira M. Khan and Daniel A.
                 Jim{\'e}nez",
  title =        "Temporal-based multilevel correlating inclusive cache
                 replacement",
  journal =      j-TACO,
  volume =       "10",
  number =       "4",
  pages =        "33:1--33:??",
  month =        dec,
  year =         "2013",
  CODEN =        "????",
  DOI =          "http://dx.doi.org/10.1145/2541228.2555290",
  ISSN =         "1544-3566 (print), 1544-3973 (electronic)",
  ISSN-L =       "1544-3566",
  bibdate =      "Thu Jan 9 10:42:35 MST 2014",
  bibsource =    "http://portal.acm.org/;
                 http://www.math.utah.edu/pub/tex/bib/taco.bib",
  abstract =     "Inclusive caches have been widely used in Chip
                 Multiprocessors (CMPs) to simplify cache coherence.
                 However, they have poor performance compared with
                 noninclusive caches not only because of the limited
                 capacity of the entire cache hierarchy but also due to
                 ignorance of temporal locality of the Last-Level Cache
                 (LLC). Blocks that are highly referenced (referred to
                 as hot blocks) are always hit in higher-level caches
                 (e.g., L1 cache) and are rarely referenced in the LLC.
                 Therefore, they become replacement victims in the LLC.
                 Due to the inclusion property, blocks evicted from the
                 LLC have to also be invalidated from higher-level
                 caches. Invalidation of hot blocks from the entire
                 cache hierarchy introduces costly off-chip misses that
                 makes the inclusive cache perform poorly. Neither
                 blocks that are highly referenced in the LLC nor blocks
                 that are highly referenced in higher-level caches
                 should be the LLC replacement victims. We propose
                 temporal-based multilevel correlating cache replacement
                 for inclusive caches to evict blocks in the LLC that
                 are also not hot in higher-level caches using
                 correlated temporal information acquired from all
                 levels of a cache hierarchy with minimal overhead.
                 Invalidation of these blocks does not hurt the
                 performance. By contrast, replacing them as early as
                 possible with useful blocks helps improve cache
                 performance. Based on our experiments, in a dual-core
                 CMP, an inclusive cache with temporal-based multilevel
                 correlating cache replacement significantly outperforms
                 an inclusive cache with traditional LRU replacement by
                 yielding an average speedup of 12.7\%, which is
                 comparable to an enhanced noninclusive cache, while
                 requiring less than 1\% of storage overhead.",
  acknowledgement = ack-nhfb,
  articleno =    "33",
  fjournal =     "ACM Transactions on Architecture and Code Optimization
                 (TACO)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J924",
}

@Article{Liu:2013:HSA,
  author =       "Qixiao Liu and Miquel Moreto and Victor Jimenez and
                 Jaume Abella and Francisco J. Cazorla and Mateo
                 Valero",
  title =        "Hardware support for accurate per-task energy metering
                 in multicore systems",
  journal =      j-TACO,
  volume =       "10",
  number =       "4",
  pages =        "34:1--34:??",
  month =        dec,
  year =         "2013",
  CODEN =        "????",
  DOI =          "http://dx.doi.org/10.1145/2541228.2555291",
  ISSN =         "1544-3566 (print), 1544-3973 (electronic)",
  ISSN-L =       "1544-3566",
  bibdate =      "Thu Jan 9 10:42:35 MST 2014",
  bibsource =    "http://portal.acm.org/;
                 http://www.math.utah.edu/pub/tex/bib/taco.bib",
  abstract =     "Accurately determining the energy consumed by each
                 task in a system will become of prominent importance in
                 future multicore-based systems because it offers
                 several benefits, including (i) better application
                 energy/performance optimizations, (ii) improved
                 energy-aware task scheduling, and (iii) energy-aware
                 billing in data centers. Unfortunately, existing
                 methods for energy metering in multicores fail to
                 provide accurate energy estimates for each task when
                 several tasks run simultaneously. This article makes a
                 case for accurate Per-Task Energy Metering (PTEM) based
                 on tracking the resource utilization and occupancy of
                 each task. Different hardware implementations with
                 different trade-offs between energy prediction accuracy
                 and hardware-implementation complexity are proposed.
                 Our evaluation shows that the energy consumed in a
                 multicore by each task can be accurately measured. For
                 a 32-core, 2-way, simultaneous multithreaded core
                 setup, PTEM reduces the average accuracy error from
                 more than 12\% when our hardware support is not used to
                 less than 4\% when it is used. The maximum observed
                 error for any task in the workload we used reduces from
                 58\% down to 9\% when our hardware support is used.",
  acknowledgement = ack-nhfb,
  articleno =    "34",
  fjournal =     "ACM Transactions on Architecture and Code Optimization
                 (TACO)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J924",
}

@Article{Mehta:2013:TSS,
  author =       "Sanyam Mehta and Gautham Beeraka and Pen-Chung Yew",
  title =        "Tile size selection revisited",
  journal =      j-TACO,
  volume =       "10",
  number =       "4",
  pages =        "35:1--35:??",
  month =        dec,
  year =         "2013",
  CODEN =        "????",
  DOI =          "http://dx.doi.org/10.1145/2541228.2555292",
  ISSN =         "1544-3566 (print), 1544-3973 (electronic)",
  ISSN-L =       "1544-3566",
  bibdate =      "Thu Jan 9 10:42:35 MST 2014",
  bibsource =    "http://portal.acm.org/;
                 http://www.math.utah.edu/pub/tex/bib/taco.bib",
  abstract =     "Loop tiling is a widely used loop transformation to
                 enhance data locality and allow data reuse. In the
                 tiled code, however, tiles of different sizes can lead
                 to significant variation in performance. Thus,
                 selection of an optimal tile size is critical to
                 performance of tiled codes. In the past, tile size
                 selection has been attempted using both static
                 analytical and dynamic empirical (auto-tuning) models.
                 Past work using static models assumed a direct-mapped
                 cache for the purpose of analysis and thus proved to be
                 less robust. On the other hand, the auto-tuning models
                 involve an exhaustive search in a large space of tiled
                 codes. In this article, we propose a new analytical
                 model for tile size selection that leverages the high
                 set associativity in modern caches to minimize conflict
                 misses. Our tile size selection model targets data
                 reuse in multiple levels of cache. In addition, it
                 considers the interaction of tiling with the SIMD unit
                 in modern processors in estimating the optimal tile
                 size. We find that these factors, not considered in
                 previous models, are critical in developing a robust
                 model for tile size selection. We implement our tile
                 size selection model in a polyhedral compiler and test
                 it on 12 benchmark kernels using two different problem
                 sizes. Our model outperforms the previous analytical
                 models that are based on reusing data in a single level
                 of cache and achieves an average performance
                 improvement of 9.7\% and 20.4\%, respectively, over the
                 best square (cubic) tiles for the two problem sizes. In
                 addition, the tile size chosen by our tile size
                 selection algorithm is similar to the best performing
                 size obtained through an extensive search, validating
                 the analytical model underlying the algorithm.",
  acknowledgement = ack-nhfb,
  articleno =    "35",
  fjournal =     "ACM Transactions on Architecture and Code Optimization
                 (TACO)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J924",
}

@Article{Prisacari:2013:FPS,
  author =       "Bogdan Prisacari and German Rodriguez and Cyriel
                 Minkenberg and Torsten Hoefler",
  title =        "Fast pattern-specific routing for fat tree networks",
  journal =      j-TACO,
  volume =       "10",
  number =       "4",
  pages =        "36:1--36:??",
  month =        dec,
  year =         "2013",
  CODEN =        "????",
  DOI =          "http://dx.doi.org/10.1145/2541228.2555293",
  ISSN =         "1544-3566 (print), 1544-3973 (electronic)",
  ISSN-L =       "1544-3566",
  bibdate =      "Thu Jan 9 10:42:35 MST 2014",
  bibsource =    "http://portal.acm.org/;
                 http://www.math.utah.edu/pub/tex/bib/taco.bib",
  abstract =     "In the context of eXtended Generalized Fat Tree (XGFT)
                 topologies, widely used in HPC and datacenter network
                 designs, we propose a generic method, based on Integer
                 Linear Programming (ILP), to efficiently determine
                 optimal routes for arbitrary workloads. We propose a
                 novel approach that combines ILP with dynamic
                 programming, effectively reducing the time to solution.
                 Specifically, we divide the network into smaller
                 subdomains optimized using a custom ILP formulation
                 that ensures global optimality of local solutions.
                 Local solutions are then combined into an optimal
                 global solution using dynamic programming. Finally, we
                 demonstrate through a series of extensive benchmarks
                 that our approach scales in practice to networks
                 interconnecting several thousands of nodes, using a
                 single-threaded, freely available linear programming
                 solver on commodity hardware, with the potential for
                 higher scalability by means of commercial, parallel
                 solvers.",
  acknowledgement = ack-nhfb,
  articleno =    "36",
  fjournal =     "ACM Transactions on Architecture and Code Optimization
                 (TACO)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J924",
}

@Article{Breughe:2013:SRB,
  author =       "Maximilien B. Breughe and Lieven Eeckhout",
  title =        "Selecting representative benchmark inputs for
                 exploring microprocessor design spaces",
  journal =      j-TACO,
  volume =       "10",
  number =       "4",
  pages =        "37:1--37:??",
  month =        dec,
  year =         "2013",
  CODEN =        "????",
  DOI =          "http://dx.doi.org/10.1145/2541228.2555294",
  ISSN =         "1544-3566 (print), 1544-3973 (electronic)",
  ISSN-L =       "1544-3566",
  bibdate =      "Thu Jan 9 10:42:35 MST 2014",
  bibsource =    "http://portal.acm.org/;
                 http://www.math.utah.edu/pub/tex/bib/taco.bib",
  abstract =     "The design process of a microprocessor requires
                 representative workloads to steer the search process
                 toward an optimum design point for the target
                 application domain. However, considering a broad set of
                 workloads to cover the large space of potential
                 workloads is infeasible given how time-consuming design
                 space exploration typically is. Hence, it is crucial to
                 select a small yet representative set of workloads,
                 which leads to a shorter design cycle while yielding a
                 (near) optimal design. Prior work has mostly looked
                 into selecting representative benchmarks; however,
                 limited attention was given to the selection of
                 benchmark inputs and how this affects workload
                 representativeness during design space exploration.
                 Using a set of 1,000 inputs for a number of embedded
                 benchmarks and a design space with around 1,700 design
                 points, we find that selecting a single or three random
                 input(s) per benchmark potentially (in a worst-case
                 scenario) leads to a suboptimal design that is 56\% and
                 33\% off, on average, relative to the optimal design in
                 our design space in terms of Energy-Delay Product
                 (EDP). We then propose and evaluate a number of methods
                 for selecting representative inputs and show that we
                 can find the optimum design point with as few as three
                 inputs.",
  acknowledgement = ack-nhfb,
  articleno =    "37",
  fjournal =     "ACM Transactions on Architecture and Code Optimization
                 (TACO)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J924",
}

@Article{Kerschbaumer:2013:IFT,
  author =       "Christoph Kerschbaumer and Eric Hennigan and Per
                 Larsen and Stefan Brunthaler and Michael Franz",
  title =        "Information flow tracking meets just-in-time
                 compilation",
  journal =      j-TACO,
  volume =       "10",
  number =       "4",
  pages =        "38:1--38:??",
  month =        dec,
  year =         "2013",
  CODEN =        "????",
  DOI =          "http://dx.doi.org/10.1145/2541228.2555295",
  ISSN =         "1544-3566 (print), 1544-3973 (electronic)",
  ISSN-L =       "1544-3566",
  bibdate =      "Thu Jan 9 10:42:35 MST 2014",
  bibsource =    "http://portal.acm.org/;
                 http://www.math.utah.edu/pub/tex/bib/taco.bib",
  abstract =     "Web applications are vulnerable to cross-site
                 scripting attacks that enable data thefts. Information
                 flow tracking in web browsers can prevent communication
                 of sensitive data to unintended recipients and thereby
                 stop such data thefts. Unfortunately, existing
                 solutions have focused on incorporating information
                 flow into browsers' JavaScript interpreters, rather
                 than just-in-time compilers, rendering the resulting
                 performance noncompetitive. Few users will switch to a
                 safer browser if it comes at the cost of significantly
                 degrading web application performance. We present the
                 first information flow tracking JavaScript engine that
                 is based on a true just-in-time compiler, and that
                 thereby outperforms all previous interpreter-based
                 information flow tracking JavaScript engines by more
                 than a factor of two. Our JIT-based engine (i) has the
                 same coverage as previous interpreter-based solutions,
                 (ii) requires reasonable implementation effort, and
                 (iii) introduces new optimizations to achieve
                 acceptable performance. When evaluated against three
                 industry-standard JavaScript benchmark suites, there is
                 still an average slowdown of 73\% over engines that do
                 not support information flow, but this is now well
                 within the range that many users will find an
                 acceptable price for obtaining substantially increased
                 security.",
  acknowledgement = ack-nhfb,
  articleno =    "38",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J924",
}

@Article{Nasre:2013:TSE,
  author =       "Rupesh Nasre",
  title =        "Time- and space-efficient flow-sensitive points-to
                 analysis",
  journal =      j-TACO,
  volume =       "10",
  number =       "4",
  pages =        "39:1--39:??",
  month =        dec,
  year =         "2013",
  CODEN =        "????",
  DOI =          "http://dx.doi.org/10.1145/2541228.2555296",
  ISSN =         "1544-3566 (print), 1544-3973 (electronic)",
  ISSN-L =       "1544-3566",
  bibdate =      "Thu Jan 9 10:42:35 MST 2014",
  bibsource =    "http://portal.acm.org/;
                 http://www.math.utah.edu/pub/tex/bib/taco.bib",
  abstract =     "Compilation of real-world programs often requires
                 hours. The term nightly build known to industrial
                 researchers is an artifact of long compilation times.
                 Our goal is to reduce the absolute analysis times for
                 large C codes (of the order of millions of lines).
                 Pointer analysis is one of the key analyses performed
                 during compilation. Its scalability is paramount to
                 achieve the efficiency of the overall compilation
                 process and its precision directly affects that of the
                 client analyses. In this work, we design a time- and
                 space-efficient flow-sensitive pointer analysis and
                 parallelize it on graphics processing units. Our
                 analysis proposes to use an extended bloom filter,
                 called multibloom, to store points-to information in an
                 approximate manner and develops an analysis in terms of
                 the operations over the multibloom. Since bloom filter
                 is a probabilistic data structure, we develop ways to
                 gain back the analysis precision. We achieve effective
                 parallelization by achieving memory coalescing,
                 reducing thread divergence, and improving load balance
                 across GPU warps. Compared to a state-of-the-art
                 sequential solution, our parallel version achieves a
                 7.8 $ \times $ speedup with less than 5\% precision
                 loss on a suite of six large programs. Using two client
                 transformations, we show that this loss in precision
                 only minimally affects a client's precision.",
  acknowledgement = ack-nhfb,
  articleno =    "39",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J924",
}

@Article{Ruan:2013:BTB,
  author =       "Wenjia Ruan and Yujie Liu and Michael Spear",
  title =        "Boosting timestamp-based transactional memory by
                 exploiting hardware cycle counters",
  journal =      j-TACO,
  volume =       "10",
  number =       "4",
  pages =        "40:1--40:??",
  month =        dec,
  year =         "2013",
  CODEN =        "????",
  DOI =          "http://dx.doi.org/10.1145/2541228.2555297",
  ISSN =         "1544-3566 (print), 1544-3973 (electronic)",
  ISSN-L =       "1544-3566",
  bibdate =      "Thu Jan 9 10:42:35 MST 2014",
  bibsource =    "http://portal.acm.org/;
                 http://www.math.utah.edu/pub/tex/bib/taco.bib",
  abstract =     "Time-based transactional memories typically rely on a
                 shared memory counter to ensure consistency.
                 Unfortunately, such a counter can become a bottleneck.
                 In this article, we identify properties of hardware
                 cycle counters that allow their use in place of a
                 shared memory counter. We then devise algorithms that
                 exploit the x86 cycle counter to enable bottleneck-free
                 transactional memory runtime systems. We also consider
                 the impact of privatization safety and hardware
                 ordering constraints on the correctness, performance,
                 and generality of our algorithms.",
  acknowledgement = ack-nhfb,
  articleno =    "40",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J924",
}

@Article{Dey:2013:RMD,
  author =       "Tanima Dey and Wei Wang and Jack W. Davidson and Mary
                 Lou Soffa",
  title =        "{ReSense}: Mapping dynamic workloads of colocated
                 multithreaded applications using resource sensitivity",
  journal =      j-TACO,
  volume =       "10",
  number =       "4",
  pages =        "41:1--41:??",
  month =        dec,
  year =         "2013",
  CODEN =        "????",
  DOI =          "http://dx.doi.org/10.1145/2541228.2555298",
  ISSN =         "1544-3566 (print), 1544-3973 (electronic)",
  ISSN-L =       "1544-3566",
  bibdate =      "Thu Jan 9 10:42:35 MST 2014",
  bibsource =    "http://portal.acm.org/;
                 http://www.math.utah.edu/pub/tex/bib/taco.bib",
  abstract =     "To utilize the full potential of modern chip
                 multiprocessors and obtain scalable performance
                 improvements, it is critical to mitigate resource
                 contention created by multithreaded workloads. In this
                 article, we describe ReSense, the first runtime system
                 that uses application characteristics to dynamically
                 map multithreaded applications from dynamic
                 workloads---workloads where multithreaded applications
                 arrive, execute, and terminate continuously in
                 unpredictable ways. ReSense mitigates contention for
                 the shared resources in the memory hierarchy by
                 applying a novel thread-mapping algorithm that
                 dynamically adjusts the mapping of threads from dynamic
                 workloads using a precalculated sensitivity score. The
                 sensitivity score quantifies an application's
                 sensitivity to sharing a particular memory resource and
                 is calculated by an efficient characterization process
                 that involves running the multithreaded application by
                 itself on the target platform. To measure ReSense's
                 effectiveness, sensitivity scores were determined for
                 21 benchmarks from PARSEC-2.1 and NPB-OMP-3.3 for the
                 shared resources in the memory hierarchy on four
                 different platforms. Using three different-sized
                 dynamic workloads composed of randomly selected two,
                 four, and eight corunning benchmarks with randomly
                 selected start times, ReSense was able to improve the
                 average response time of the three workloads by up to
                 27.03\%, 20.89\%, and 29.34\% and throughput by up to
                 19.97\%, 46.56\%, and 29.86\%, respectively, over the
                 native OS on real hardware. By estimating and comparing
                 ReSense's effectiveness with the optimal thread mapping
                 for two different workloads, we found that the maximum
                 average difference with the experimentally determined
                 optimal performance was 1.49\% for average response
                 time and 2.08\% for throughput.",
  acknowledgement = ack-nhfb,
  articleno =    "41",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J924",
}

@Article{Armejach:2013:TIP,
  author =       "Adri{\`a} Armejach and Ruben Titos-Gil and Anurag Negi
                 and Osman S. Unsal and Adri{\'a}n Cristal",
  title =        "Techniques to improve performance in requester-wins
                 hardware transactional memory",
  journal =      j-TACO,
  volume =       "10",
  number =       "4",
  pages =        "42:1--42:??",
  month =        dec,
  year =         "2013",
  CODEN =        "????",
  DOI =          "http://dx.doi.org/10.1145/2541228.2555299",
  ISSN =         "1544-3566 (print), 1544-3973 (electronic)",
  ISSN-L =       "1544-3566",
  bibdate =      "Thu Jan 9 10:42:35 MST 2014",
  bibsource =    "http://portal.acm.org/;
                 http://www.math.utah.edu/pub/tex/bib/taco.bib",
  abstract =     "The simplicity of requester-wins Hardware
                 Transactional Memory (HTM) makes it easy to incorporate
                 in existing chip multiprocessors. Hence, such systems
                 are expected to be widely available in the near future.
                 Unfortunately, these implementations are prone to
                 suffer severe performance degradation due to transient
                 and persistent livelock conditions. This article shows
                 that existing techniques are unable to mitigate this
                 degradation effectively. It then proposes and evaluates
                 four novel techniques---two software-based that employ
                 information provided by the hardware and two that
                 require simple core-local hardware additions---which have
                 the potential to boost the performance of
                 requester-wins HTM designs substantially.",
  acknowledgement = ack-nhfb,
  articleno =    "42",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J924",
}

@Article{Jeon:2013:RDR,
  author =       "Myeongjae Jeon and Conglong Li and Alan L. Cox and
                 Scott Rixner",
  title =        "Reducing {DRAM} row activations with eager read\slash
                 write clustering",
  journal =      j-TACO,
  volume =       "10",
  number =       "4",
  pages =        "43:1--43:??",
  month =        dec,
  year =         "2013",
  CODEN =        "????",
  DOI =          "http://dx.doi.org/10.1145/2541228.2555300",
  ISSN =         "1544-3566 (print), 1544-3973 (electronic)",
  ISSN-L =       "1544-3566",
  bibdate =      "Thu Jan 9 10:42:35 MST 2014",
  bibsource =    "http://portal.acm.org/;
                 http://www.math.utah.edu/pub/tex/bib/taco.bib",
  abstract =     "This article describes and evaluates a new approach to
                 optimizing DRAM performance and energy consumption that
                 is based on eagerly writing dirty cache lines to DRAM.
                 Under this approach, many dirty cache lines are written
                 to DRAM before they are evicted. In particular, dirty
                 cache lines that have not been recently accessed are
                 eagerly written to DRAM when the corresponding row has
                 been activated by an ordinary, noneager access, such as
                 a read. This approach enables clustering of reads and
                 writes that target the same row, resulting in a
                 significant reduction in row activations. Specifically,
                 for a variety of applications, it reduces the number of
                 DRAM row activations by an average of 42\% and a
                 maximum of 82\%. Moreover, the results from a
                 full-system simulator show compelling performance
                 improvements and energy consumption reductions. Out of
                 23 applications, 6 have overall performance
                 improvements between 10\% and 20\%, and 3 have
                 improvements in excess of 20\%. Furthermore, 12 consume
                 between 10\% and 20\% less DRAM energy, and 7 have
                 energy consumption reductions in excess of 20\%.",
  acknowledgement = ack-nhfb,
  articleno =    "43",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J924",
}

@Article{Zhao:2013:HPP,
  author =       "Zhijia Zhao and Michael Bebenita and Dave Herman and
                 Jianhua Sun and Xipeng Shen",
  title =        "{HPar}: a practical parallel parser for {HTML} ---
                 taming {HTML} complexities for parallel parsing",
  journal =      j-TACO,
  volume =       "10",
  number =       "4",
  pages =        "44:1--44:??",
  month =        dec,
  year =         "2013",
  CODEN =        "????",
  DOI =          "http://dx.doi.org/10.1145/2541228.2555301",
  ISSN =         "1544-3566 (print), 1544-3973 (electronic)",
  ISSN-L =       "1544-3566",
  bibdate =      "Thu Jan 9 10:42:35 MST 2014",
  bibsource =    "http://portal.acm.org/;
                 http://www.math.utah.edu/pub/tex/bib/taco.bib",
  abstract =     "Parallelizing HTML parsing is challenging due to the
                 complexities of HTML documents and the inherent
                 dependencies in its parsing algorithm. As a result,
                 despite numerous studies in parallel parsing, HTML
                 parsing remains sequential today. It forms one of the
                 final barriers for fully parallelizing browser
                 operations to minimize the browser's response time---an
                 important variable for user experiences, especially on
                 portable devices. This article provides a comprehensive
                 analysis on the special complexities of parallel HTML
                 parsing and presents a systematic exploration in
                 overcoming those difficulties through specially
                 designed speculative parallelizations. This work
                 develops, to the best of our knowledge, the first
                 pipelining and data-level parallel HTML parsers. The
                 data-level parallel parser, named HPar, achieves up to
                 2.4$ \times $ speedup on quadcore devices. This work
                 demonstrates the feasibility of efficient, parallel
                 HTML parsing for the first time and offers a set of
                 novel insights for parallel HTML parsing.",
  acknowledgement = ack-nhfb,
  articleno =    "44",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J924",
}

@Article{Totoni:2013:EFE,
  author =       "Ehsan Totoni and Mert Dikmen and Mar{\'\i}a Jes{\'u}s
                 Garzar{\'a}n",
  title =        "Easy, fast, and energy-efficient object detection on
                 heterogeneous on-chip architectures",
  journal =      j-TACO,
  volume =       "10",
  number =       "4",
  pages =        "45:1--45:??",
  month =        dec,
  year =         "2013",
  CODEN =        "????",
  DOI =          "http://dx.doi.org/10.1145/2541228.2555302",
  ISSN =         "1544-3566 (print), 1544-3973 (electronic)",
  ISSN-L =       "1544-3566",
  bibdate =      "Thu Jan 9 10:42:35 MST 2014",
  bibsource =    "http://portal.acm.org/;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib;
                 http://www.math.utah.edu/pub/tex/bib/taco.bib",
  abstract =     "We optimize a visual object detection application
                 (that uses Vision Video Library kernels) and show that
                 OpenCL is a unified programming paradigm that can
                 provide high performance when running on the Ivy Bridge
                 heterogeneous on-chip architecture. We evaluate
                 different mapping techniques and show that running each
                 kernel where it fits the best and using software
                 pipelining can provide 1.91 times higher performance
                 and 42\% better energy efficiency. We also show how to
                 trade accuracy for energy at runtime. Overall, our
                 application can perform accurate object detection at 40
                 frames per second (fps) in an energy-efficient
                 manner.",
  acknowledgement = ack-nhfb,
  articleno =    "45",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J924",
}

@Article{Fedorov:2013:AAL,
  author =       "Viacheslav V. Fedorov and Sheng Qiu and A. L.
                 Narasimha Reddy and Paul V. Gratz",
  title =        "{ARI}: Adaptive {LLC}-memory traffic management",
  journal =      j-TACO,
  volume =       "10",
  number =       "4",
  pages =        "46:1--46:??",
  month =        dec,
  year =         "2013",
  CODEN =        "????",
  DOI =          "http://dx.doi.org/10.1145/2543697",
  ISSN =         "1544-3566 (print), 1544-3973 (electronic)",
  ISSN-L =       "1544-3566",
  bibdate =      "Thu Jan 9 10:42:35 MST 2014",
  bibsource =    "http://portal.acm.org/;
                 http://www.math.utah.edu/pub/tex/bib/taco.bib",
  abstract =     "Decreasing the traffic from the CPU LLC to main memory
                 is a very important issue in modern systems. Recent
                 work focuses on cache misses, overlooking the impact of
                 writebacks on the total memory traffic, energy
                 consumption, IPC, and so forth. Policies that foster a
                 balanced approach, between reducing write traffic to
                 memory and improving miss rates, can increase overall
                 performance and improve energy efficiency and memory
                 system lifetime for NVM memory technology, such as
                 phase-change memory (PCM). We propose Adaptive
                 Replacement and Insertion (ARI), an adaptive approach
                 to last-level CPU cache management, optimizing the two
                 parameters (miss rate and writeback rate)
                 simultaneously. Our specific focus is to reduce
                 writebacks as much as possible while maintaining or
                 improving the miss rate relative to conventional LRU
                 replacement policy. ARI reduces LLC writebacks by 33\%,
                 on average, while also decreasing misses by 4.7\%, on
                 average. In a typical system, this boosts IPC by 4.9\%,
                 on average, while decreasing energy consumption by
                 8.9\%. These results are achieved with minimal hardware
                 overheads.",
  acknowledgement = ack-nhfb,
  articleno =    "46",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J924",
}

@Article{Gonzalez-Alvarez:2013:AAD,
  author =       "Cecilia Gonz{\'a}lez-{\'A}lvarez and Jennifer B.
                 Sartor and Carlos {\'A}lvarez and Daniel
                 Jim{\'e}nez-Gonz{\'a}lez and Lieven Eeckhout",
  title =        "Accelerating an application domain with specialized
                 functional units",
  journal =      j-TACO,
  volume =       "10",
  number =       "4",
  pages =        "47:1--47:??",
  month =        dec,
  year =         "2013",
  CODEN =        "????",
  DOI =          "http://dx.doi.org/10.1145/2541228.2555303",
  ISSN =         "1544-3566 (print), 1544-3973 (electronic)",
  ISSN-L =       "1544-3566",
  bibdate =      "Thu Jan 9 10:42:35 MST 2014",
  bibsource =    "http://portal.acm.org/;
                 http://www.math.utah.edu/pub/tex/bib/taco.bib",
  abstract =     "Hardware specialization has received renewed interest
                 recently as chips are hitting power limits. Chip
                 designers of traditional processor architectures have
                 primarily focused on general-purpose computing,
                 partially due to time-to-market pressure and simpler
                 design processes. But new power limits require some
                 chip specialization. Although hardware configured for a
                 specific application yields large speedups for
                 low-power dissipation, its design is more complex and
                 less reusable. We instead explore domain-based
                 specialization, a scalable approach that balances
                 hardware's reusability and performance efficiency. We
                 focus on specialization using customized compute units
                 that accelerate particular operations. In this article,
                 we develop automatic techniques to identify code
                 sequences from different applications within a domain
                 that can be targeted to a new custom instruction that
                 will be run inside a configurable specialized
                 functional unit (SFU). We demonstrate that using a
                 canonical representation of computations finds more
                 common code sequences among applications that can be
                 mapped to the same custom instruction, leading to
                 larger speedups while specializing a smaller core area
                 than previous pattern-matching techniques. We also
                 propose new heuristics to narrow the search space of
                 domain-specific custom instructions, finding those that
                 achieve the best performance across applications. We
                 estimate the overall performance achieved with our
                 automatic techniques using hardware models on a set of
                 nine media benchmarks, showing that when limiting the
                 core area devoted to specialization, the SFU
                 customization with the largest speedups includes both
                 application- and domain-specific custom instructions.
                 We demonstrate that exploring domain-specific hardware
                 acceleration is key to continued computing system
                 performance improvements.",
  acknowledgement = ack-nhfb,
  articleno =    "47",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J924",
}

@Article{Wang:2013:RMM,
  author =       "Xiaolin Wang and Lingmei Weng and Zhenlin Wang and
                 Yingwei Luo",
  title =        "Revisiting memory management on virtualized
                 environments",
  journal =      j-TACO,
  volume =       "10",
  number =       "4",
  pages =        "48:1--48:??",
  month =        dec,
  year =         "2013",
  CODEN =        "????",
  DOI =          "http://dx.doi.org/10.1145/2541228.2555304",
  ISSN =         "1544-3566 (print), 1544-3973 (electronic)",
  ISSN-L =       "1544-3566",
  bibdate =      "Thu Jan 9 10:42:35 MST 2014",
  bibsource =    "http://portal.acm.org/;
                 http://www.math.utah.edu/pub/tex/bib/taco.bib",
  abstract =     "With the evolvement of hardware, 64-bit Central
                 Processing Units (CPUs) and 64-bit Operating Systems
                 (OSs) have dominated the market. This article
                 investigates the performance of virtual memory
                 management of Virtual Machines (VMs) with a large
                 virtual address space in 64-bit OSs, which imposes
                 different pressure on memory virtualization than 32-bit
                 systems. Each of the two conventional memory
                 virtualization approaches, Shadowing Paging (SP) and
                 Hardware-Assisted Paging (HAP), causes different
                 overhead for different applications. Our experiments
                 show that 64-bit applications prefer to run in a VM
                 using SP, while 32-bit applications do not have a
                 uniform preference between SP and HAP. In this article,
                 we trace this inconsistency between 32-bit applications
                 and 64-bit applications to its root cause through a
                 systematic empirical study in Linux systems and
                 discover that the major overhead of SP results from
                 memory management in the 32-bit GNU C library (glibc).
                 We propose enhancements to the existing memory
                 management algorithms, which substantially reduce the
                 overhead of SP. Based on the evaluations using SPEC
                 CPU2006, Parsec 2.1, and cloud benchmarks, our results
                 show that SP, with the improved memory allocators, can
                 compete with HAP in almost all cases, in both 64-bit
                 and 32-bit systems. We conclude that without a
                 significant breakthrough in HAP, researchers should pay
                 more attention to SP, which is more flexible and cost
                 effective.",
  acknowledgement = ack-nhfb,
  articleno =    "48",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J924",
}

@Article{Jiang:2013:PAP,
  author =       "Chuntao Jiang and Zhibin Yu and Hai Jin and Chengzhong
                 Xu and Lieven Eeckhout and Wim Heirman and Trevor E.
                 Carlson and Xiaofei Liao",
  title =        "{PCantorSim}: Accelerating parallel architecture
                 simulation through fractal-based sampling",
  journal =      j-TACO,
  volume =       "10",
  number =       "4",
  pages =        "49:1--49:??",
  month =        dec,
  year =         "2013",
  CODEN =        "????",
  DOI =          "http://dx.doi.org/10.1145/2541228.2555305",
  ISSN =         "1544-3566 (print), 1544-3973 (electronic)",
  ISSN-L =       "1544-3566",
  bibdate =      "Thu Jan 9 10:42:35 MST 2014",
  bibsource =    "http://portal.acm.org/;
                 http://www.math.utah.edu/pub/tex/bib/taco.bib",
  abstract =     "Computer architects rely heavily on microarchitecture
                 simulation to evaluate design alternatives.
                 Unfortunately, cycle-accurate simulation is extremely
                 slow, being at least 4 to 6 orders of magnitude slower
                 than real hardware. This longstanding problem is
                 further exacerbated in the multi-/many-core era,
                 because single-threaded simulation performance has not
                 improved much, while the design space has expanded
                 substantially. Parallel simulation is a promising
                 approach, yet does not completely solve the simulation
                 challenge. Furthermore, existing sampling techniques,
                 which are widely used for single-threaded applications,
                 do not readily apply to multithreaded applications as
                 thread interaction and synchronization must now be
                 taken into account. This work presents PCantorSim, a
                 novel Cantor set (a classic fractal)--based sampling
                 scheme to accelerate parallel simulation of
                 multithreaded applications. Through the use of the
                 proposed methodology, only less than 5\% of an
                 application's execution time is simulated in detail. We
                 have implemented our approach in Sniper (a parallel
                 multicore simulator) and evaluated it by running the
                 PARSEC benchmarks on a simulated 8-core system. The
                 results show that PCantorSim increases simulation speed
                 over detailed parallel simulation by a factor of 20$
                 \times $, on average, with an average absolute
                 execution time prediction error of 5.3\%.",
  acknowledgement = ack-nhfb,
  articleno =    "49",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J924",
}

@Article{Stipic:2013:PGT,
  author =       "Srdan Stipi{\'c} and Vesna Smiljkovi{\'c} and Osman
                 Unsal and Adri{\'a}n Cristal and Mateo Valero",
  title =        "Profile-guided transaction coalescing --- lowering
                 transactional overheads by merging transactions",
  journal =      j-TACO,
  volume =       "10",
  number =       "4",
  pages =        "50:1--50:??",
  month =        dec,
  year =         "2013",
  CODEN =        "????",
  DOI =          "http://dx.doi.org/10.1145/2541228.2555306",
  ISSN =         "1544-3566 (print), 1544-3973 (electronic)",
  ISSN-L =       "1544-3566",
  bibdate =      "Thu Jan 9 10:42:35 MST 2014",
  bibsource =    "http://portal.acm.org/;
                 http://www.math.utah.edu/pub/tex/bib/hash.bib;
                 http://www.math.utah.edu/pub/tex/bib/taco.bib",
  abstract =     "Previous studies in software transactional memory
                 mostly focused on reducing the overhead of
                 transactional read and write operations. In this
                 article, we introduce transaction coalescing, a
                 profile-guided compiler optimization technique that
                 attempts to reduce the overheads of starting and
                 committing a transaction by merging two or more small
                 transactions into one large transaction. We develop a
                 profiling tool and a transaction coalescing heuristic
                 to identify candidate transactions suitable for
                 coalescing. We implement a compiler extension to
                 automatically merge the candidate transactions at the
                 compile time. We evaluate the effectiveness of our
                 technique using the hash table micro-benchmark and the
                 STAMP benchmark suite. Transaction coalescing improves
                 the performance of the hash table significantly and the
                 performance of Vacation and SSCA2 benchmarks by 19.4\%
                 and 36.4\%, respectively, when running with 12
                 threads.",
  acknowledgement = ack-nhfb,
  articleno =    "50",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J924",
}

@Article{Wang:2013:WWA,
  author =       "Zhe Wang and Shuchang Shan and Ting Cao and Junli Gu
                 and Yi Xu and Shuai Mu and Yuan Xie and Daniel A.
                 Jim{\'e}nez",
  title =        "{WADE}: Writeback-aware dynamic cache management for
                 {NVM}-based main memory system",
  journal =      j-TACO,
  volume =       "10",
  number =       "4",
  pages =        "51:1--51:??",
  month =        dec,
  year =         "2013",
  CODEN =        "????",
  DOI =          "http://dx.doi.org/10.1145/2541228.2555307",
  ISSN =         "1544-3566 (print), 1544-3973 (electronic)",
  ISSN-L =       "1544-3566",
  bibdate =      "Thu Jan 9 10:42:35 MST 2014",
  bibsource =    "http://portal.acm.org/;
                 http://www.math.utah.edu/pub/tex/bib/taco.bib",
  abstract =     "Emerging Non-Volatile Memory (NVM) technologies are
                 explored as potential alternatives to traditional
                 SRAM/DRAM-based memory architecture in future
                 microprocessor design. One of the major disadvantages
                 for NVM is the latency and energy overhead associated
                 with write operations. Mitigation techniques to
                 minimize the write overhead for NVM-based main memory
                 architecture have been studied extensively. However,
                 most prior work focuses on optimization techniques for
                 NVM-based main memory itself, with little attention
                 paid to cache management policies for the Last-Level
                 Cache (LLC). In this article, we propose a
                 Writeback-Aware Dynamic CachE (WADE) management
                 technique to help mitigate the write overhead in
                 NVM-based memory. The
                 proposal is based on the observation that, when dirty
                 cache blocks are evicted from the LLC and written into
                 NVM-based memory (with PCM as an example), the long
                 latency and high energy associated with write
                 operations to NVM-based memory can cause system
                 performance/power degradation. Thus, reducing the
                 number of writeback requests from the LLC is critical.
                 The proposed WADE cache management technique tries to
                 keep highly reused dirty cache blocks in the LLC. The
                 technique predicts blocks that are frequently written
                 back in the LLC. The LLC sets are dynamically
                 partitioned into a frequent writeback list and a
                 nonfrequent writeback list. It keeps a best size of
                 each list in the LLC. Our evaluation shows that the
                 technique can reduce the number of writeback requests
                 by 16.5\% for memory-intensive single-threaded
                 benchmarks and 10.8\% for multicore workloads. It
                 yields a geometric mean speedup of 5.1\% for
                 single-thread applications and 7.6\% for multicore
                 workloads. Due to the reduced number of writeback
                 requests to main memory, the technique reduces the
                 energy consumption by 8.1\% for single-thread
                 applications and 7.6\% for multicore workloads.",
  acknowledgement = ack-nhfb,
  articleno =    "51",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J924",
}

@Article{Li:2013:CCC,
  author =       "Yong Li and Yaojun Zhang and Hai Li and Yiran Chen and
                 Alex K. Jones",
  title =        "{C1C}: a configurable, compiler-guided {STT-RAM L1}
                 cache",
  journal =      j-TACO,
  volume =       "10",
  number =       "4",
  pages =        "52:1--52:??",
  month =        dec,
  year =         "2013",
  CODEN =        "????",
  DOI =          "http://dx.doi.org/10.1145/2541228.2555308",
  ISSN =         "1544-3566 (print), 1544-3973 (electronic)",
  ISSN-L =       "1544-3566",
  bibdate =      "Thu Jan 9 10:42:35 MST 2014",
  bibsource =    "http://portal.acm.org/;
                 http://www.math.utah.edu/pub/tex/bib/taco.bib",
  abstract =     "Spin-Transfer Torque RAM (STT-RAM), a promising
                 alternative to SRAM for reducing leakage power
                 consumption, has been widely studied to mitigate the
                 impact of its asymmetrically long write latency.
                 Recently, STT-RAM has been proposed for L1 caches by
                 relaxing the data retention time to improve write
                 performance and dynamic energy. However, as the
                 technology scales down from 65nm to 22nm, the
                 performance of the read operation scales poorly due to
                 reduced sense margins and sense amplifier delays. In
                 this article, we leverage a dual-mode STT memory cell
                 to design a configurable L1 cache architecture termed
                 C1C to mitigate read performance barriers with
                 technology scaling. Guided by application access
                 characteristics discovered through novel compiler
                 analyses, the proposed cache adaptively switches
                 between a high performance and a low-power access mode.
                 Our evaluation demonstrates that the proposed cache
                 with compiler guidance outperforms a state-of-the-art
                 STT-RAM cache design by 9\% with high dynamic energy
                 efficiency, leading to significant performance/watt
                 improvements over several competing approaches.",
  acknowledgement = ack-nhfb,
  articleno =    "52",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J924",
}

@Article{Fauzia:2013:BRD,
  author =       "Naznin Fauzia and Venmugil Elango and Mahesh
                 Ravishankar and J. Ramanujam and Fabrice Rastello and
                 Atanas Rountev and Louis-No{\"e}l Pouchet and P.
                 Sadayappan",
  title =        "Beyond reuse distance analysis: Dynamic analysis for
                 characterization of data locality potential",
  journal =      j-TACO,
  volume =       "10",
  number =       "4",
  pages =        "53:1--53:??",
  month =        dec,
  year =         "2013",
  CODEN =        "????",
  DOI =          "http://dx.doi.org/10.1145/2541228.2555309",
  ISSN =         "1544-3566 (print), 1544-3973 (electronic)",
  ISSN-L =       "1544-3566",
  bibdate =      "Thu Jan 9 10:42:35 MST 2014",
  bibsource =    "http://portal.acm.org/;
                 http://www.math.utah.edu/pub/tex/bib/taco.bib",
  abstract =     "Emerging computer architectures will feature
                 drastically decreased flops/byte (ratio of peak
                 processing rate to memory bandwidth) as highlighted by
                 recent studies on Exascale architectural trends.
                 Further, flops are getting cheaper, while the energy
                 cost of data movement is increasingly dominant. The
                 understanding and characterization of data locality
                 properties of computations is critical in order to
                 guide efforts to enhance data locality. Reuse distance
                 analysis of memory address traces is a valuable tool to
                 perform data locality characterization of programs. A
                 single reuse distance analysis can be used to estimate
                 the number of cache misses in a fully associative LRU
                 cache of any size, thereby providing estimates on the
                 minimum bandwidth requirements at different levels of
                 the memory hierarchy to avoid being bandwidth bound.
                 However, such an analysis only holds for the particular
                 execution order that produced the trace. It cannot
                 estimate potential improvement in data locality through
                 dependence-preserving transformations that change the
                 execution schedule of the operations in the
                 computation. In this article, we develop a novel
                 dynamic analysis approach to characterize the inherent
                 locality properties of a computation and thereby assess
                 the potential for data locality enhancement via
                 dependence-preserving transformations. The execution
                 trace of a code is analyzed to extract a
                 Computational-Directed Acyclic Graph (CDAG) of the data
                 dependences. The CDAG is then partitioned into convex
                 subsets, and the convex partitioning is used to reorder
                 the operations in the execution trace to enhance data
                 locality. The approach enables us to go beyond reuse
                 distance analysis of a single specific order of
                 execution of the operations of a computation in
                 characterization of its data locality properties. It
                 can serve a valuable role in identifying promising code
                 regions for manual transformation, as well as assessing
                 the effectiveness of compiler transformations for data
                 locality enhancement. We demonstrate the effectiveness
                 of the approach using a number of benchmarks, including
                 case studies where the potential shown by the analysis
                 is exploited to achieve lower data movement costs and
                 better performance.",
  acknowledgement = ack-nhfb,
  articleno =    "53",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J924",
}

@Article{Bardizbanyan:2013:DPD,
  author =       "Alen Bardizbanyan and Magnus Sj{\"a}lander and David
                 Whalley and Per Larsson-Edefors",
  title =        "Designing a practical data filter cache to improve
                 both energy efficiency and performance",
  journal =      j-TACO,
  volume =       "10",
  number =       "4",
  pages =        "54:1--54:??",
  month =        dec,
  year =         "2013",
  CODEN =        "????",
  DOI =          "http://dx.doi.org/10.1145/2541228.2555310",
  ISSN =         "1544-3566 (print), 1544-3973 (electronic)",
  ISSN-L =       "1544-3566",
  bibdate =      "Thu Jan 9 10:42:35 MST 2014",
  bibsource =    "http://portal.acm.org/;
                 http://www.math.utah.edu/pub/tex/bib/taco.bib",
  abstract =     "Conventional Data Filter Cache (DFC) designs improve
                 processor energy efficiency, but degrade performance.
                 Furthermore, the single-cycle line transfer suggested
                 in prior studies adversely affects Level-1 Data Cache
                 (L1 DC) area and energy efficiency. We propose a
                 practical DFC that is accessed early in the pipeline
                 and transfers a line over multiple cycles. Our DFC
                 design improves performance and eliminates a
                 substantial fraction of L1 DC accesses for loads, L1 DC
                 tag checks on stores, and data translation lookaside
                 buffer accesses for both loads and stores. Our
                 evaluation shows that the proposed DFC can reduce the
                 data access energy by 42.5\% and improve execution time
                 by 4.2\%.",
  acknowledgement = ack-nhfb,
  articleno =    "54",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J924",
}

@Article{Hagiescu:2013:GCG,
  author =       "Andrei Hagiescu and Bing Liu and R. Ramanathan and
                 Sucheendra K. Palaniappan and Zheng Cui and Bipasa
                 Chattopadhyay and P. S. Thiagarajan and Weng-Fai Wong",
  title =        "{GPU} code generation for {ODE}-based applications with
                 phased shared-data access patterns",
  journal =      j-TACO,
  volume =       "10",
  number =       "4",
  pages =        "55:1--55:??",
  month =        dec,
  year =         "2013",
  CODEN =        "????",
  DOI =          "http://dx.doi.org/10.1145/2541228.2555311",
  ISSN =         "1544-3566 (print), 1544-3973 (electronic)",
  ISSN-L =       "1544-3566",
  bibdate =      "Thu Jan 9 10:42:35 MST 2014",
  bibsource =    "http://portal.acm.org/;
                 http://www.math.utah.edu/pub/tex/bib/taco.bib",
  abstract =     "We present a novel code generation scheme for GPUs.
                 Its key feature is the platform-aware generation of a
                 heterogeneous pool of threads. This exposes more
                 data-sharing opportunities among the concurrent threads
                 and reduces the memory requirements that would
                 otherwise exceed the capacity of the on-chip memory.
                 Instead of the conventional strategy of focusing on
                 exposing as much parallelism as possible, our scheme
                 leverages on the phased nature of memory access
                 patterns found in many applications that exhibit
                 massive parallelism. We demonstrate the effectiveness
                 of our code generation strategy on a computational
                 systems biology application. This application consists
                 of computing a Dynamic Bayesian Network (DBN)
                 approximation of the dynamics of signalling pathways
                 described as a system of Ordinary Differential
                 Equations (ODEs). The approximation algorithm involves
                 (i) sampling many (of the order of a few million) times
                 from the set of initial states, (ii) generating
                 trajectories through numerical integration, and (iii)
                 storing the statistical properties of this set of
                 trajectories in Conditional Probability Tables (CPTs)
                 of a DBN via a prespecified discretization of the time
                 and value domains. The trajectories can be computed in
                 parallel. However, the intermediate data needed for
                 computing them, as well as the entries for the CPTs,
                 are too large to be stored locally. Our experiments
                 show that the proposed code generation scheme scales
                 well, achieving significant performance improvements on
                 three realistic signalling pathways models. These
                 results suggest how our scheme could be extended to
                 deal with other applications involving systems of
                 ODEs.",
  acknowledgement = ack-nhfb,
  articleno =    "55",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J924",
}

@Article{Lee:2013:TLS,
  author =       "Junghee Lee and Chrysostomos Nicopoulos and Hyung Gyu
                 Lee and Jongman Kim",
  title =        "{TornadoNoC}: a lightweight and scalable on-chip
                 network architecture for the many-core era",
  journal =      j-TACO,
  volume =       "10",
  number =       "4",
  pages =        "56:1--56:??",
  month =        dec,
  year =         "2013",
  CODEN =        "????",
  DOI =          "http://dx.doi.org/10.1145/2541228.2555312",
  ISSN =         "1544-3566 (print), 1544-3973 (electronic)",
  ISSN-L =       "1544-3566",
  bibdate =      "Thu Jan 9 10:42:35 MST 2014",
  bibsource =    "http://portal.acm.org/;
                 http://www.math.utah.edu/pub/tex/bib/taco.bib",
  abstract =     "The rapid emergence of Chip Multi-Processors (CMP) as
                 the de facto microprocessor archetype has highlighted
                 the importance of scalable and efficient on-chip
                 networks. Packet-based Networks-on-Chip (NoC) are
                 gradually cementing themselves as the medium of choice
                 for the multi-/many-core systems of the near future,
                 due to their innate scalability. However, the
                 prominence of the debilitating power wall requires the
                 NoC to also be as energy efficient as possible. To
                 achieve these two antipodal requirements---scalability
                 and energy efficiency---we propose TornadoNoC, an
                 interconnect architecture that employs a novel flow
                 control mechanism. To prevent livelocks and deadlocks,
                 a sequence numbering scheme and a dynamic ring
                 inflation technique are proposed, and their correctness
                 formally proven. The primary objective of TornadoNoC is
                 to achieve substantial gains in (a) scalability to
                 many-core systems and (b) the area/power footprint, as
                 compared to current state-of-the-art router
                 implementations. The new router is demonstrated to
                 provide better scalability to hundreds of cores than an
                 ideal single-cycle wormhole implementation and other
                 scalability-enhanced low-cost routers. Extensive
                 simulations using both synthetic traffic patterns and
                 real applications running in a full-system simulator
                 corroborate the efficacy of the proposed design.
                 Finally, hardware synthesis analysis using commercial
                 65nm standard-cell libraries indicates that the area
                 and power budgets of the new router are reduced by up
                 to 53\% and 58\%, respectively, as compared to existing
                 state-of-the-art low-cost routers.",
  acknowledgement = ack-nhfb,
  articleno =    "56",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J924",
}

@Article{Strydis:2013:SAP,
  author =       "Christos Strydis and Robert M. Seepers and Pedro
                 Peris-Lopez and Dimitrios Siskos and Ioannis Sourdis",
  title =        "A system architecture, processor, and communication
                 protocol for secure implants",
  journal =      j-TACO,
  volume =       "10",
  number =       "4",
  pages =        "57:1--57:??",
  month =        dec,
  year =         "2013",
  CODEN =        "????",
  DOI =          "http://dx.doi.org/10.1145/2541228.2555313",
  ISSN =         "1544-3566 (print), 1544-3973 (electronic)",
  ISSN-L =       "1544-3566",
  bibdate =      "Thu Jan 9 10:42:35 MST 2014",
  bibsource =    "http://portal.acm.org/;
                 http://www.math.utah.edu/pub/tex/bib/taco.bib",
  abstract =     "Secure and energy-efficient communication between
                 Implantable Medical Devices (IMDs) and authorized
                 external users is attracting increasing attention these
                 days. However, there currently exists no systematic
                 approach to the problem, while solutions from
                 neighboring fields, such as wireless sensor networks,
                 are not directly transferable due to the peculiarities
                 of the IMD domain. This work describes an original,
                 efficient solution for secure IMD communication. A new
                 implant system architecture is proposed, where security
                 and main-implant functionality are made completely
                 decoupled by running the tasks onto two separate cores.
                 Wireless communication goes through a custom security
                 ASIP, called SISC (Smart-Implant Security Core), which
                 runs an energy-efficient security protocol. The
                 security core is powered by RF-harvested energy until
                 it performs external-reader authentication, providing
                 an elegant defense mechanism against battery
                 Denial-of-Service (DoS) and other, more common attacks.
                 The system has been evaluated based on a realistic case
                 study involving an artificial pancreas implant. When
                 synthesized for a UMC 90nm CMOS ASIC technology, our
                 system architecture achieves defense against
                 unauthorized accesses having zero energy cost, running
                 entity authentication through harvesting only 7.45 $
                 \mu $J of RF energy from the requesting entity. In all
                 other successfully authenticated accesses, our
                 architecture achieves secure data exchange without
                 affecting the performance of the main IMD
                 functionality, adding less than 1{\textperthousand} (1.3 mJ) to the
                 daily energy consumption of a typical implant. Compared
                 to a single-core, secure reference IMD, which would
                 still be more vulnerable to some types of attacks, our
                 secure system on chip (SoC) achieves high security
                 levels at 56\% energy savings and at an area overhead
                 of less than 15\%.",
  acknowledgement = ack-nhfb,
  articleno =    "57",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J924",
}

@Article{Kim:2013:FMS,
  author =       "Wonsub Kim and Yoonseo Choi and Haewoo Park",
  title =        "Fast modulo scheduler utilizing patternized routes for
                 coarse-grained reconfigurable architectures",
  journal =      j-TACO,
  volume =       "10",
  number =       "4",
  pages =        "58:1--58:??",
  month =        dec,
  year =         "2013",
  CODEN =        "????",
  DOI =          "http://dx.doi.org/10.1145/2541228.2555314",
  ISSN =         "1544-3566 (print), 1544-3973 (electronic)",
  ISSN-L =       "1544-3566",
  bibdate =      "Thu Jan 9 10:42:35 MST 2014",
  bibsource =    "http://portal.acm.org/;
                 http://www.math.utah.edu/pub/tex/bib/taco.bib",
  abstract =     "Coarse-Grained Reconfigurable Architectures (CGRAs)
                 present a potential of high compute throughput with
                 energy efficiency. A CGRA consists of an array of
                 Functional Units (FUs), which communicate with each
                 other through an interconnect network containing
                 transmission nodes and register files. To achieve high
                 performance from the software solutions mapped onto
                 CGRAs, modulo scheduling of loops is generally
                 employed. One of the key challenges in modulo
                 scheduling for CGRAs is to explicitly handle routings
                 of operands from a source to a destination operation
                 through various routing resources. Existing modulo
                 schedulers for CGRAs are slow because finding a valid
                 routing is generally a searching problem over a large
                 space, even with the guidance of well-defined cost
                 metrics. Applications in traditional embedded
                 multimedia domains are regarded as relatively tolerant
                 to a slow compile time in exchange for a high-quality
                 solution. However, many rapidly growing domains of
                 applications, such as 3D graphics, require a fast
                 compilation. Entrances of CGRAs to these domains have
                 been blocked mainly due to their long compile time. We
                 attack this problem by utilizing patternized routes,
                 for which resources and time slots for a success can be
                 estimated in advance when a source operation is placed.
                 By conservatively reserving predefined resources at
                 predefined time slots, future routings originating from
                 the source operation are guaranteed. Experiments on a
                 real-world 3D graphics benchmark suite show that our
                 scheduler improves the compile time up to 6,000 times
                 while achieving an average 70\% throughputs of the
                 state-of-the-art CGRA modulo scheduler, the
                 Edge-centric Modulo Scheduler (EMS).",
  acknowledgement = ack-nhfb,
  articleno =    "58",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J924",
}

@Article{Nuzman:2013:JTC,
  author =       "Dorit Nuzman and Revital Eres and Sergei Dyshel and
                 Marcel Zalmanovici and Jose Castanos",
  title =        "{JIT} technology with {C\slash C++}: Feedback-directed
                 dynamic recompilation for statically compiled
                 languages",
  journal =      j-TACO,
  volume =       "10",
  number =       "4",
  pages =        "59:1--59:??",
  month =        dec,
  year =         "2013",
  CODEN =        "????",
  DOI =          "http://dx.doi.org/10.1145/2541228.2555315",
  ISSN =         "1544-3566 (print), 1544-3973 (electronic)",
  ISSN-L =       "1544-3566",
  bibdate =      "Thu Jan 9 10:42:35 MST 2014",
  bibsource =    "http://portal.acm.org/;
                 http://www.math.utah.edu/pub/tex/bib/taco.bib",
  abstract =     "The growing gap between the advanced capabilities of
                 static compilers as reflected in benchmarking results
                 and the actual performance that users experience in
                 real-life scenarios makes client-side dynamic
                 optimization technologies imperative to the domain of
                 static languages. Dynamic optimization of software
                 distributed in the form of a platform-agnostic
                 Intermediate-Representation (IR) has been very
                 successful in the domain of managed languages, greatly
                 improving upon interpreted code, especially when online
                 profiling is used. However, can such feedback-directed
                 IR-based dynamic code generation be viable in the
                 domain of statically compiled, rather than interpreted,
                 languages? We show that fat binaries, which combine the
                 IR together with the statically compiled executable,
                 can provide a practical solution for software vendors,
                 allowing their software to be dynamically optimized
                 without the limitation of binary-level approaches,
                 which lack the high-level IR of the program, and
                 without the warm-up costs associated with the IR-only
                 software distribution approach. We describe and
                 evaluate the fat-binary-based runtime compilation
                 approach using SPECint2006, demonstrating that the
                 overheads it incurs are low enough to be successfully
                 surmounted by dynamic optimization. Building on Java
                 JIT technologies, our results already improve upon
                 common real-world usage scenarios, including very small
                 workloads.",
  acknowledgement = ack-nhfb,
  articleno =    "59",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J924",
}

@Article{Ramashekar:2013:ADA,
  author =       "Thejas Ramashekar and Uday Bondhugula",
  title =        "Automatic data allocation and buffer management for
                 multi-{GPU} machines",
  journal =      j-TACO,
  volume =       "10",
  number =       "4",
  pages =        "60:1--60:??",
  month =        dec,
  year =         "2013",
  CODEN =        "????",
  DOI =          "http://dx.doi.org/10.1145/2544100",
  ISSN =         "1544-3566 (print), 1544-3973 (electronic)",
  ISSN-L =       "1544-3566",
  bibdate =      "Thu Jan 9 10:42:35 MST 2014",
  bibsource =    "http://portal.acm.org/;
                 http://www.math.utah.edu/pub/tex/bib/taco.bib",
  abstract =     "Multi-GPU machines are being increasingly used in
                 high-performance computing. Each GPU in such a machine
                 has its own memory and does not share the address space
                 either with the host CPU or other GPUs. Hence,
                 applications utilizing multiple GPUs have to manually
                 allocate and manage data on each GPU. Existing works
                 that propose to automate data allocations for GPUs have
                 limitations and inefficiencies in terms of allocation
                 sizes, exploiting reuse, transfer costs, and
                 scalability. We propose a scalable and fully automatic
                 data allocation and buffer management scheme for affine
                 loop nests on multi-GPU machines. We call it the
                 Bounding-Box-based Memory Manager (BBMM). BBMM can
                 perform at runtime, during standard set operations like
                 union, intersection, and difference, finding subset and
                 superset relations on hyperrectangular regions of array
                 data (bounding boxes). It uses these operations along
                 with some compiler assistance to identify, allocate,
                 and manage data required by applications in terms of
                 disjoint bounding boxes. This allows it to (1) allocate
                 exactly or nearly as much data as is required by
                 computations running on each GPU, (2) efficiently track
                 buffer allocations and hence maximize data reuse across
                 tiles and minimize data transfer overhead, and (3)
                 as a result, maximize utilization of the combined
                 memory on multi-GPU machines. BBMM can work with any
                 choice of parallelizing transformations, computation
                 placement, and scheduling schemes, whether static or
                 dynamic. Experiments run on a four-GPU machine with
                 various scientific programs showed that BBMM reduces
                 data allocations on each GPU by up to 75\% compared to
                 current allocation schemes, yields performance of at
                 least 88\% of manually written code, and allows
                 excellent weak scaling.",
  acknowledgement = ack-nhfb,
  articleno =    "60",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J924",
}

@Article{Vandierendonck:2013:ADT,
  author =       "Hans Vandierendonck and George Tzenakis and Dimitrios
                 S. Nikolopoulos",
  title =        "Analysis of dependence tracking algorithms for task
                 dataflow execution",
  journal =      j-TACO,
  volume =       "10",
  number =       "4",
  pages =        "61:1--61:??",
  month =        dec,
  year =         "2013",
  CODEN =        "????",
  DOI =          "http://dx.doi.org/10.1145/2541228.2555316",
  ISSN =         "1544-3566 (print), 1544-3973 (electronic)",
  ISSN-L =       "1544-3566",
  bibdate =      "Thu Jan 9 10:42:35 MST 2014",
  bibsource =    "http://portal.acm.org/;
                 http://www.math.utah.edu/pub/tex/bib/taco.bib",
  abstract =     "Processor architectures have taken a turn toward
                 many-core processors, which integrate multiple
                 processing cores on a single chip to increase overall
                 performance, and there are no signs that this trend
                 will stop in the near future. Many-core processors are
                 harder to program than multicore and single-core
                 processors due to the need for writing parallel or
                 concurrent programs with high degrees of parallelism.
                 Moreover, many-cores have to operate in a mode of
                 strong scaling because of memory bandwidth constraints.
                 In strong scaling, increasingly finer-grain parallelism
                 must be extracted in order to keep all processing cores
                 busy. Task dataflow programming models have a high
                 potential to simplify parallel programming because they
                 alleviate the programmer from identifying precisely all
                 intertask dependences when writing programs. Instead,
                 the task dataflow runtime system detects and enforces
                 intertask dependences during execution based on the
                 description of memory accessed by each task. The
                 runtime constructs a task dataflow graph that captures
                 all tasks and their dependences. Tasks are scheduled to
                 execute in parallel, taking into account dependences
                 specified in the task graph. Several papers report
                 important overheads for task dataflow systems, which
                 severely limits the scalability and usability of such
                 systems. In this article, we study efficient schemes to
                 manage task graphs and analyze their scalability. We
                 assume a programming model that supports input, output,
                 and in/out annotations on task arguments, as well as
                 commutative in/out and reductions. We analyze the
                 structure of task graphs and identify versions and
                 generations as key concepts for efficient management of
                 task graphs. Then, we present three schemes to manage
                 task graphs building on graph representations,
                 hypergraphs, and lists. We also consider a fourth
                 edgeless scheme that synchronizes tasks using integers.
                 Analysis using microbenchmarks shows that the graph
                 representation is not always scalable and that the
                 edgeless scheme introduces least overhead in nearly all
                 situations.",
  acknowledgement = ack-nhfb,
  articleno =    "61",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J924",
}

%%% TACO 10(4), article 62 (December 2013): evaluator-executor loop
%%% transformation for CGRAs.  fjournal records the full journal
%%% title alongside the abbreviated journal macro.
@Article{Jeong:2013:EET,
  author =       "Yeonghun Jeong and Seongseok Seo and Jongeun Lee",
  title =        "Evaluator-executor transformation for efficient
                 pipelining of loops with conditionals",
  journal =      j-TACO,
  volume =       "10",
  number =       "4",
  pages =        "62:1--62:??",
  month =        dec,
  year =         "2013",
  CODEN =        "????",
  DOI =          "http://dx.doi.org/10.1145/2541228.2555317",
  ISSN =         "1544-3566 (print), 1544-3973 (electronic)",
  ISSN-L =       "1544-3566",
  bibdate =      "Thu Jan 9 10:42:35 MST 2014",
  bibsource =    "http://portal.acm.org/;
                 http://www.math.utah.edu/pub/tex/bib/taco.bib",
  abstract =     "Control divergence poses many problems in
                 parallelizing loops. While predicated execution is
                 commonly used to convert control dependence into data
                 dependence, it often incurs high overhead because it
                 allocates resources equally for both branches of a
                 conditional statement regardless of their execution
                 frequencies. For those loops with unbalanced
                 conditionals, we propose a software transformation that
                 divides a loop into two or three smaller loops so that
                 the condition is evaluated only in the first loop,
                 while the less frequent branch is executed in the
                 second loop in a way that is much more efficient than
                 in the original loop. To reduce the overhead of extra
                 data transfer caused by the loop fission, we also
                 present a hardware extension for a class of
                 Coarse-Grained Reconfigurable Architectures (CGRAs).
                 Our experiments using MiBench and computer vision
                 benchmarks on a CGRA demonstrate that our techniques
                 can improve the performance of loops over predicated
                 execution by up to 65\% (37.5\%, on average), when the
                 hardware extension is enabled. Without any hardware
                 modification, our software-only version can improve
                 performance by up to 64\% (33\%, on average), while
                 simultaneously reducing the energy consumption of the
                 entire CGRA including configuration and data memory by
                 22\%, on average.",
  acknowledgement = ack-nhfb,
  articleno =    "62",
  fjournal =     "ACM Transactions on Architecture and Code Optimization
                 (TACO)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J924",
}

%%% TACO 10(4), article 63 (December 2013): decoupled non-SSA register
%%% allocation on bipartite liveness graphs.  fjournal records the
%%% full journal title alongside the abbreviated journal macro.
@Article{Barik:2013:DNS,
  author =       "Rajkishore Barik and Jisheng Zhao and Vivek Sarkar",
  title =        "A decoupled non-{SSA} global register allocation using
                 bipartite liveness graphs",
  journal =      j-TACO,
  volume =       "10",
  number =       "4",
  pages =        "63:1--63:??",
  month =        dec,
  year =         "2013",
  CODEN =        "????",
  DOI =          "http://dx.doi.org/10.1145/2544101",
  ISSN =         "1544-3566 (print), 1544-3973 (electronic)",
  ISSN-L =       "1544-3566",
  bibdate =      "Thu Jan 9 10:42:35 MST 2014",
  bibsource =    "http://portal.acm.org/;
                 http://www.math.utah.edu/pub/tex/bib/taco.bib",
  abstract =     "Register allocation is an essential optimization for
                 all compilers. A number of sophisticated register
                 allocation algorithms have been developed over the
                 years. The two fundamental classes of register
                 allocation algorithms used in modern compilers are
                 based on Graph Coloring (GC) and Linear Scan (LS).
                 However, these two algorithms have fundamental
                 limitations in terms of precision. For example, the key
                 data structure used in GC-based algorithms, the
                 interference graph, lacks information on the program
                 points at which two variables may interfere. The
                 LS-based algorithms make local decisions regarding
                 spilling, and thereby trade off global optimization for
                 reduced compile-time and space overheads. Recently,
                 researchers have proposed Static Single Assignment
                 (SSA)-based decoupled register allocation algorithms
                 that exploit the live-range split points of the SSA
                 representation to optimally solve the spilling problem.
                 However, SSA-based register allocation often requires
                 extra complexity in repairing register assignments
                 during SSA elimination and in addressing architectural
                 constraints such as aliasing and ABI encoding; this
                 extra overhead can be prohibitively expensive in
                 dynamic compilation contexts. This article proposes a
                 decoupled non-SSA--based global register allocation
                 algorithm for dynamic compilation. It addresses the
                 limitations in current algorithms by introducing a
                 Bipartite Liveness Graph (BLG)-based register
                 allocation algorithm that models the spilling phase as
                 an optimization problem on the BLG itself and the
                 assignment phase as a separate optimization problem.
                 Advanced register allocation optimizations such as move
                 coalescing, live-range splitting, and register class
                 handling are also performed along with the spilling and
                 assignment phases. In the presence of register classes,
                 we propose a bucket-based greedy heuristic for
                 assignment that strikes a balance between spill-cost
                 and register class constraints. We present experimental
                 evaluation of our BLG-based register allocation
                 algorithm and compare it with production-quality
                 register allocators in Jikes RVM and LLVM.",
  acknowledgement = ack-nhfb,
  articleno =    "63",
  fjournal =     "ACM Transactions on Architecture and Code Optimization
                 (TACO)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J924",
}

%%% TACO 10(4), article 64 (December 2013): instruction-fetch energy
%%% reduction in multi-issue processors.  fjournal records the full
%%% journal title alongside the abbreviated journal macro.
@Article{Gavin:2013:RIF,
  author =       "Peter Gavin and David Whalley and Magnus
                 Sj{\"a}lander",
  title =        "Reducing instruction fetch energy in multi-issue
                 processors",
  journal =      j-TACO,
  volume =       "10",
  number =       "4",
  pages =        "64:1--64:??",
  month =        dec,
  year =         "2013",
  CODEN =        "????",
  DOI =          "http://dx.doi.org/10.1145/2541228.2555318",
  ISSN =         "1544-3566 (print), 1544-3973 (electronic)",
  ISSN-L =       "1544-3566",
  bibdate =      "Thu Jan 9 10:42:35 MST 2014",
  bibsource =    "http://portal.acm.org/;
                 http://www.math.utah.edu/pub/tex/bib/taco.bib",
  abstract =     "The need to minimize power while maximizing
                 performance has led to recent developments of powerful
                 superscalar designs targeted at embedded and portable
                 use. Instruction fetch is responsible for a significant
                 fraction of microprocessor power and energy, and is
                 therefore an attractive target for architectural power
                 optimization. We present novel techniques that take
                 advantage of guarantees so that the instruction
                 translation lookaside buffer, branch target buffer, and
                 branch prediction buffer can frequently be disabled,
                 reducing their energy usage, while simultaneously
                 reducing branch predictor contention. These techniques
                 require no changes to the instruction set and can
                 easily be integrated into most single- and
                 multiple-issue processors.",
  acknowledgement = ack-nhfb,
  articleno =    "64",
  fjournal =     "ACM Transactions on Architecture and Code Optimization
                 (TACO)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J924",
}

%%% TACO 10(4), article 65 (December 2013): list of distinguished
%%% reviewers.  The entry records no abstract.
@Article{Anonymous:2013:LDR,
  author =       "Anonymous",
  title =        "List of distinguished reviewers {ACM TACO}",
  journal =      j-TACO,
  volume =       "10",
  number =       "4",
  pages =        "65:1--65:??",
  month =        dec,
  year =         "2013",
  CODEN =        "????",
  DOI =          "http://dx.doi.org/10.1145/2560216",
  ISSN =         "1544-3566 (print), 1544-3973 (electronic)",
  ISSN-L =       "1544-3566",
  bibdate =      "Fri Mar 14 17:30:44 MDT 2014",
  bibsource =    "http://portal.acm.org/;
                 http://www.math.utah.edu/pub/tex/bib/taco.bib",
  acknowledgement = ack-nhfb,
  articleno =    "65",
  fjournal =     "ACM Transactions on Architecture and Code Optimization
                 (TACO)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J924",
}

%%% TACO 11(1), article 1 (February 2014): shared-port register file
%%% architecture for VLIW processors.
@Article{Goel:2014:SPR,
  author =       "Neeraj Goel and Anshul Kumar and Preeti Ranjan Panda",
  title =        "Shared-port register file architecture for low-energy
                 {VLIW} processors",
  journal =      j-TACO,
  volume =       "11",
  number =       "1",
  pages =        "1:1--1:??",
  month =        feb,
  year =         "2014",
  CODEN =        "????",
  DOI =          "http://dx.doi.org/10.1145/2533397",
  ISSN =         "1544-3566 (print), 1544-3973 (electronic)",
  ISSN-L =       "1544-3566",
  bibdate =      "Fri Mar 14 17:30:52 MDT 2014",
  bibsource =    "http://portal.acm.org/;
                 http://www.math.utah.edu/pub/tex/bib/taco.bib",
  abstract =     "We propose a reduced-port Register File (RF)
                 architecture for reducing RF energy in a VLIW
                 processor. With port reduction, RF ports need to be
                 shared among Function Units (FUs), which may lead to
                 access conflicts, and thus, reduced performance. Our
                 solution includes (i) a carefully designed RF-FU
                 interconnection network that permits port sharing with
                 minimum conflicts and without any delay/energy
                 overheads, and (ii) a novel scheduling and binding
                 algorithm that reduces the performance penalty. With
                 our solution, we observed as much as 83\% RF energy
                 savings with no more than a 10\% loss in performance
                 for a set of Mediabench and Mibench benchmarks.",
  acknowledgement = ack-nhfb,
  articleno =    "1",
  fjournal =     "ACM Transactions on Architecture and Code Optimization
                 (TACO)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J924",
}

%%% TACO 11(1), article 2 (February 2014): profile-driven parallelism
%%% detection with machine-learning-based mapping.  bibsource also
%%% names pvm.bib.
@Article{Wang:2014:IPD,
  author =       "Zheng Wang and Georgios Tournavitis and Bj{\"o}rn
                 Franke and Michael F. P. O'Boyle",
  title =        "Integrating profile-driven parallelism detection and
                 machine-learning-based mapping",
  journal =      j-TACO,
  volume =       "11",
  number =       "1",
  pages =        "2:1--2:??",
  month =        feb,
  year =         "2014",
  CODEN =        "????",
  DOI =          "http://dx.doi.org/10.1145/2579561",
  ISSN =         "1544-3566 (print), 1544-3973 (electronic)",
  ISSN-L =       "1544-3566",
  bibdate =      "Fri Mar 14 17:30:52 MDT 2014",
  bibsource =    "http://portal.acm.org/;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib;
                 http://www.math.utah.edu/pub/tex/bib/taco.bib",
  abstract =     "Compiler-based auto-parallelization is a much-studied
                 area but has yet to find widespread application. This
                 is largely due to the poor identification and
                 exploitation of application parallelism, resulting in
                 disappointing performance far below that which a
                 skilled expert programmer could achieve. We have
                 identified two weaknesses in traditional parallelizing
                 compilers and propose a novel, integrated approach
                 resulting in significant performance improvements of
                 the generated parallel code. Using profile-driven
                 parallelism detection, we overcome the limitations of
                 static analysis, enabling the identification of more
                 application parallelism, and only rely on the user for
                 final approval. We then replace the traditional
                 target-specific and inflexible mapping heuristics with
                 a machine-learning-based prediction mechanism,
                 resulting in better mapping decisions while automating
                 adaptation to different target architectures. We have
                 evaluated our parallelization strategy on the NAS and
                 SPEC CPU2000 benchmarks and two different multicore
                 platforms (dual quad-core Intel Xeon SMP and
                 dual-socket QS20 Cell blade). We demonstrate that our
                 approach not only yields significant improvements when
                 compared with state-of-the-art parallelizing compilers
                 but also comes close to and sometimes exceeds the
                 performance of manually parallelized codes. On average,
                 our methodology achieves 96\% of the performance of the
                 hand-tuned OpenMP NAS and SPEC parallel benchmarks on
                 the Intel Xeon platform and gains a significant speedup
                 for the IBM Cell platform, demonstrating the potential
                 of profile-guided and machine-learning-based
                 parallelization for complex multicore platforms.",
  acknowledgement = ack-nhfb,
  articleno =    "2",
  fjournal =     "ACM Transactions on Architecture and Code Optimization
                 (TACO)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J924",
}

%%% TACO 11(1), article 3 (February 2014): Paragon, cooperative
%%% CPU/GPU loop speculation.  bibsource also names pvm.bib.
@Article{Samadi:2014:LGU,
  author =       "Mehrzad Samadi and Amir Hormati and Janghaeng Lee and
                 Scott Mahlke",
  title =        "Leveraging {GPUs} using cooperative loop speculation",
  journal =      j-TACO,
  volume =       "11",
  number =       "1",
  pages =        "3:1--3:??",
  month =        feb,
  year =         "2014",
  CODEN =        "????",
  DOI =          "http://dx.doi.org/10.1145/2579617",
  ISSN =         "1544-3566 (print), 1544-3973 (electronic)",
  ISSN-L =       "1544-3566",
  bibdate =      "Fri Mar 14 17:30:52 MDT 2014",
  bibsource =    "http://portal.acm.org/;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib;
                 http://www.math.utah.edu/pub/tex/bib/taco.bib",
  abstract =     "Graphics processing units, or GPUs, provide TFLOPs of
                 additional performance potential in commodity computer
                 systems that frequently go unused by most applications.
                 Even with the emergence of languages such as CUDA and
                 OpenCL, programming GPUs remains a difficult challenge
                 for a variety of reasons, including the inherent
                 algorithmic characteristics and data structure choices
                 used by applications as well as the tedious performance
                 optimization cycle that is necessary to achieve high
                 performance. The goal of this work is to increase the
                 applicability of GPUs beyond CUDA/OpenCL to implicitly
                 data-parallel applications written in C/C++ using
                 speculative parallelization. To achieve this goal, we
                 propose Paragon: a static/dynamic compiler platform to
                 speculatively run possibly data-parallel portions of
                 sequential applications on the GPU while cooperating
                 with the system CPU. For such loops, Paragon utilizes
                 the GPU in an opportunistic way while orchestrating a
                 cooperative relation between the CPU and GPU to reduce
                 the overhead of miss-speculations. Paragon monitors the
                 dependencies for the loops running speculatively on the
                 GPU and nonspeculatively on the CPU using a lightweight
                 distributed conflict detection designed specifically
                 for GPUs, and transfers the execution to the CPU in
                 case a conflict is detected. Paragon resumes the
                 execution on the GPU after the CPU resolves the
                 dependency. Our experiments show that Paragon achieves
                 4x on average and up to 30x speedup compared to unsafe
                 CPU execution with four threads and 7x on average and
                 up to 64x speedup versus sequential execution across a
                 set of sequential but implicitly data-parallel
                 applications.",
  acknowledgement = ack-nhfb,
  articleno =    "3",
  fjournal =     "ACM Transactions on Architecture and Code Optimization
                 (TACO)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J924",
}

%%% TACO 11(1), article 4 (February 2014): i2WAP wear-leveling policy
%%% for non-volatile caches.  The policy name is typeset as one token,
%%% i$^2$WAP, with no space after the superscript.
@Article{Wang:2014:EAC,
  author =       "Jue Wang and Xiangyu Dong and Yuan Xie and Norman P.
                 Jouppi",
  title =        "Endurance-aware cache line management for non-volatile
                 caches",
  journal =      j-TACO,
  volume =       "11",
  number =       "1",
  pages =        "4:1--4:??",
  month =        feb,
  year =         "2014",
  CODEN =        "????",
  DOI =          "http://dx.doi.org/10.1145/2579671",
  ISSN =         "1544-3566 (print), 1544-3973 (electronic)",
  ISSN-L =       "1544-3566",
  bibdate =      "Fri Mar 14 17:30:52 MDT 2014",
  bibsource =    "http://portal.acm.org/;
                 http://www.math.utah.edu/pub/tex/bib/taco.bib",
  abstract =     "Nonvolatile memories (NVMs) have the potential to
                 replace low-level SRAM or eDRAM on-chip caches because
                 NVMs save standby power and provide large cache
                 capacity. However, limited write endurance is a common
                 problem for NVM technologies, and today's cache
                 management might result in unbalanced cache write
                 traffic, causing heavily written cache blocks to fail
                 much earlier than others. Although wear-leveling
                 techniques for NVM-based main memories exist, we cannot
                 simply apply them to NVM-based caches. This is because
                 cache writes have intraset variations as well as
                 interset variations, while writes to main memories only
                 have interset variations. To solve this problem, we
                 propose i$^2$WAP, a new cache management policy that
                 can reduce both inter- and intraset write variations.
                 i$^2$WAP has two features: Swap-Shift, an enhancement
                 based on existing main memory wear leveling to reduce
                 cache interset write variations, and Probabilistic Set
                 Line Flush, a novel technique to reduce cache intraset
                 write variations. Implementing i$^2$WAP only needs two
                 global counters and two global registers. In one of our
                 studies, i$^2$WAP can improve the NVM cache lifetime
                 by 75\% on average and up to 224\%. We also validate
                 that i$^2$WAP is effective in systems with different
                 cache configurations and workloads.",
  acknowledgement = ack-nhfb,
  articleno =    "4",
  fjournal =     "ACM Transactions on Architecture and Code Optimization
                 (TACO)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J924",
}

%%% TACO 11(1), article 5 (February 2014): BPM/BPM+ software-based
%%% DRAM bank- and channel-level partitioning.
@Article{Liu:2014:BBS,
  author =       "Lei Liu and Zehan Cui and Yong Li and Yungang Bao and
                 Mingyu Chen and Chengyong Wu",
  title =        "{{BPM\slash BPM+}}: Software-based dynamic memory
                 partitioning mechanisms for mitigating {DRAM}
                 bank-\slash channel-level interferences in multicore
                 systems",
  journal =      j-TACO,
  volume =       "11",
  number =       "1",
  pages =        "5:1--5:??",
  month =        feb,
  year =         "2014",
  CODEN =        "????",
  DOI =          "http://dx.doi.org/10.1145/2579672",
  ISSN =         "1544-3566 (print), 1544-3973 (electronic)",
  ISSN-L =       "1544-3566",
  bibdate =      "Fri Mar 14 17:30:52 MDT 2014",
  bibsource =    "http://portal.acm.org/;
                 http://www.math.utah.edu/pub/tex/bib/taco.bib",
  abstract =     "The main memory system is a shared resource in modern
                 multicore machines that can result in serious
                 interference leading to reduced throughput and
                 unfairness. Many new memory scheduling mechanisms have
                 been proposed to address the interference problem.
                 However, these mechanisms usually employ relative
                 complex scheduling logic and need modifications to
                 Memory Controllers (MCs), which incur expensive
                 hardware design and manufacturing overheads. This
                 article presents a practical software approach to
                 effectively eliminate the interference without any
                 hardware modifications. The key idea is to modify the
                 OS memory management system and adopt a
                 page-coloring-based Bank-level Partitioning Mechanism
                 (BPM) that allocates dedicated DRAM banks to each core
                 (or thread). By using BPM, memory requests from
                 distinct programs are segregated across multiple memory
                 banks to promote locality/fairness and reduce
                 interference. We further extend BPM to BPM+ by
                 incorporating channel-level partitioning, on which we
                 demonstrate additional gain over BPM in many cases. To
                 achieve benefits in the presence of diverse application
                 memory needs and avoid performance degradation due to
                 resource underutilization, we propose a dynamic
                 mechanism upon BPM/BPM+ that assigns appropriate
                 bank/channel resources based on application
                 memory/bandwidth demands monitored through PMU
                 (performance-monitoring unit) and a low-overhead OS
                 page table scanning process. We implement BPM/BPM+ in
                 Linux 2.6.32.15 kernel and evaluate the technique on
                 four-core and eight-core real machines by running a
                 large amount of randomly generated multiprogrammed and
                 multithreaded workloads. Experimental results show that
                 BPM/BPM+ can improve the overall system throughput by
                 4.7\%/5.9\%, on average, (up to 8.6\%/9.5\%) and reduce
                 the unfairness by an average of 4.2\%/6.1\% (up to
                 15.8\%/13.9\%).",
  acknowledgement = ack-nhfb,
  articleno =    "5",
  fjournal =     "ACM Transactions on Architecture and Code Optimization
                 (TACO)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J924",
}

%%% TACO 11(1), article 6 (February 2014): trace transitioning and
%%% exception handling in a trace-based Java JIT compiler.  bibsource
%%% also names java2010.bib.
@Article{Haubl:2014:TTE,
  author =       "Christian H{\"a}ubl and Christian Wimmer and Hanspeter
                 M{\"o}ssenb{\"o}ck",
  title =        "Trace transitioning and exception handling in a
                 trace-based {JIT} compiler for {Java}",
  journal =      j-TACO,
  volume =       "11",
  number =       "1",
  pages =        "6:1--6:??",
  month =        feb,
  year =         "2014",
  CODEN =        "????",
  DOI =          "http://dx.doi.org/10.1145/2579673",
  ISSN =         "1544-3566 (print), 1544-3973 (electronic)",
  ISSN-L =       "1544-3566",
  bibdate =      "Fri Mar 14 17:30:52 MDT 2014",
  bibsource =    "http://portal.acm.org/;
                 http://www.math.utah.edu/pub/tex/bib/java2010.bib;
                 http://www.math.utah.edu/pub/tex/bib/taco.bib",
  abstract =     "Trace-based Just-In-Time (JIT) compilation generates
                 machine code for frequently executed paths (so-called
                 traces) instead of whole methods. While this has
                 several advantages, it complicates invocation of
                 compiled traces as well as exception handling, so that
                 previous trace-based compilers limited the way in which
                 traces could be invoked. We present a significantly
                 enhanced trace-based compiler where arbitrary
                 transitions between interpreted and compiled traces are
                 possible. For that, we introduce suitable trace calling
                 conventions and extend exception handling to work both
                 within traces and across trace boundaries. Furthermore,
                 we use the recorded trace information for optimizations
                 and combine the tracing ideas with ideas from
                 partial-method compilation to avoid code bloat. An
                 extensive evaluation with the benchmark suites DaCapo
                 9.12 Bach and SPECjvm2008 shows that our trace-based
                 compiler achieves up to 59\% higher peak performance
                 than the method-based Java HotSpot client compiler. On
                 a few benchmarks, our fairly simple trace-based
                 compiler shows a higher peak performance than the Java
                 HotSpot server compiler, which is one of today's best
                 optimizing JIT compilers for Java.",
  acknowledgement = ack-nhfb,
  articleno =    "6",
  fjournal =     "ACM Transactions on Architecture and Code Optimization
                 (TACO)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J924",
}

%%% TACO 11(1), article 7 (February 2014): HMTT hybrid
%%% hardware/software DRAM access tracing system.
@Article{Huang:2014:HHH,
  author =       "Yongbing Huang and Licheng Chen and Zehan Cui and Yuan
                 Ruan and Yungang Bao and Mingyu Chen and Ninghui Sun",
  title =        "{HMTT}: a hybrid hardware\slash software tracing
                 system for bridging the {DRAM} access trace's semantic
                 gap",
  journal =      j-TACO,
  volume =       "11",
  number =       "1",
  pages =        "7:1--7:??",
  month =        feb,
  year =         "2014",
  CODEN =        "????",
  DOI =          "http://dx.doi.org/10.1145/2579668",
  ISSN =         "1544-3566 (print), 1544-3973 (electronic)",
  ISSN-L =       "1544-3566",
  bibdate =      "Fri Mar 14 17:30:52 MDT 2014",
  bibsource =    "http://portal.acm.org/;
                 http://www.math.utah.edu/pub/tex/bib/taco.bib",
  abstract =     "DRAM access traces (i.e., off-chip memory references)
                 can be extremely valuable for the design of memory
                 subsystems and performance tuning of software. Hardware
                 snooping on the off-chip memory interface is an
                 effective and nonintrusive approach to monitoring and
                 collecting real-life DRAM accesses. However, compared
                 with software-based approaches, hardware snooping
                 approaches typically lack semantic information, such as
                 process/function/object identifiers, virtual addresses,
                 and lock contexts, that is essential to the complete
                 understanding of the systems and software under
                 investigation. In this article, we propose a hybrid
                 hardware/software mechanism that is able to collect
                 off-chip memory reference traces with semantic
                 information. We have designed and implemented a
                 prototype system called HMTT (Hybrid Memory Trace
                 Tool), which uses a custom-made DIMM connector to
                 collect off-chip memory references and a high-level
                 event-encoding scheme to correlate semantic information
                 with memory references. In addition to providing
                 complete, undistorted DRAM access traces, the proposed
                 system is also able to perform various types of
                 low-overhead profiling, such as object-relative
                 accesses and multithread lock accesses.",
  acknowledgement = ack-nhfb,
  articleno =    "7",
  fjournal =     "ACM Transactions on Architecture and Code Optimization
                 (TACO)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J924",
}

%%% TACO 11(1), article 8 (February 2014): WATS workload-aware task
%%% scheduler for single-ISA asymmetric multicore architectures.
@Article{Chen:2014:AWA,
  author =       "Quan Chen and Minyi Guo",
  title =        "Adaptive workload-aware task scheduling for
                 single-{ISA} asymmetric multicore architectures",
  journal =      j-TACO,
  volume =       "11",
  number =       "1",
  pages =        "8:1--8:??",
  month =        feb,
  year =         "2014",
  CODEN =        "????",
  DOI =          "http://dx.doi.org/10.1145/2579674",
  ISSN =         "1544-3566 (print), 1544-3973 (electronic)",
  ISSN-L =       "1544-3566",
  bibdate =      "Fri Mar 14 17:30:52 MDT 2014",
  bibsource =    "http://portal.acm.org/;
                 http://www.math.utah.edu/pub/tex/bib/taco.bib",
  abstract =     "Single-ISA Asymmetric Multicore (AMC) architectures
                 have shown high performance as well as power
                 efficiency. However, current parallel programming
                 environments do not perform well on AMC because they
                 are designed for symmetric multicore architectures in
                 which all cores provide equal performance. Their random
                 task scheduling policies can result in unbalanced
                 workloads in AMC and severely degrade the performance
                 of parallel applications. To balance the workloads of
                 parallel applications in AMC, this article proposes an
                 adaptive Workload-Aware Task Scheduler (WATS) that
                 consists of a history-based task allocator and a
                 preference-based task scheduler. The history-based task
                 allocator is based on a near-optimal, static task
                 allocation using the historical statistics collected
                 during the execution of a parallel application. The
                 preference-based task scheduler, which schedules tasks
                 based on a preference list, can dynamically adjust the
                 workloads in AMC if the task allocation is less optimal
                 due to approximation in the history-based task
                 allocator. Experimental results show that WATS can
                 improve both the performance and energy efficiency of
                 task-based applications, with the performance gain up
                 to 66.1\% compared with traditional task schedulers.",
  acknowledgement = ack-nhfb,
  articleno =    "8",
  fjournal =     "ACM Transactions on Architecture and Code Optimization
                 (TACO)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J924",
}

%%% TACO 11(1), Feb 2014, article 11.
%%% In the abstract, "approximately" is written $ \sim $ because a bare
%%% tilde is a TeX tie (unbreakable space) and would typeset as a blank.
@Article{Jothi:2014:TCF,
  author =       "Komal Jothi and Haitham Akkary",
  title =        "Tuning the continual flow pipeline architecture with
                 virtual register renaming",
  journal =      j-TACO,
  volume =       "11",
  number =       "1",
  pages =        "11:1--11:??",
  month =        feb,
  year =         "2014",
  CODEN =        "????",
  DOI =          "http://dx.doi.org/10.1145/2579675",
  ISSN =         "1544-3566 (print), 1544-3973 (electronic)",
  ISSN-L =       "1544-3566",
  bibdate =      "Fri Mar 14 17:30:52 MDT 2014",
  bibsource =    "http://portal.acm.org/;
                 http://www.math.utah.edu/pub/tex/bib/taco.bib",
  abstract =     "Continual Flow Pipelines (CFPs) allow a processor core
                 to process hundreds of in-flight instructions without
                 increasing cycle-critical pipeline resources. When a
                 load misses the data cache, CFP checkpoints the
                 processor register state and then moves all
                 miss-dependent instructions into a low-complexity WB to
                 unblock the pipeline. Meanwhile, miss-independent
                 instructions execute normally and update the processor
                 state. When the miss data return, CFP replays the
                 miss-dependent instructions from the WB and then merges
                 the miss-dependent and miss-independent execution
                 results. CFP was initially proposed for cache misses to
                 DRAM. Later work focused on reducing the execution
                 overhead of CFP by avoiding the pipeline flush before
                 replaying miss-dependent instructions and executing
                 dependent and independent instructions concurrently.
                 The goal of these improvements was to gain performance
                 by applying CFP to L1 data cache misses that hit the
                 last level on chip cache. However, many applications or
                 execution phases of applications incur excessive amount
                 of replay and/or rollbacks to the checkpoint. This
                 frequently cancels benefits from CFP and reduces
                 performance. In this article, we improve the CFP
                 architecture by using a novel virtual register renaming
                 substrate and by tuning the replay policies to mitigate
                 excessive replays and rollbacks to the checkpoint. We
                 describe these new design optimizations and show, using
                 Spec 2006 benchmarks and microarchitecture performance
                 and power models of our design, that our Tuned-CFP
                 architecture improves performance and energy
                 consumption over previous CFP architectures by
                 $ \sim $10\% and $ \sim $8\%, respectively. We also
                 demonstrate that our proposed architecture gives better
                 performance return on energy per instruction compared
                 to a conventional superscalar as well as previous CFP
                 architectures.",
  acknowledgement = ack-nhfb,
  articleno =    "11",
  fjournal =     "ACM Transactions on Architecture and Code Optimization
                 (TACO)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J924",
}

%%% TACO 11(1), Feb 2014, article 13.
@Article{Kritikakou:2014:SNO,
  author          = {Angeliki Kritikakou and Francky Catthoor and Vasilios
                     Kelefouras and Costas Goutis},
  title           = {A scalable and near-optimal representation of access
                     schemes for memory management},
  journal         = j-TACO,
  volume          = {11},
  number          = {1},
  pages           = {13:1--13:??},
  month           = feb,
  year            = {2014},
  CODEN           = {????},
  DOI             = {http://dx.doi.org/10.1145/2579677},
  ISSN            = {1544-3566 (print), 1544-3973 (electronic)},
  ISSN-L          = {1544-3566},
  bibdate         = {Fri Mar 14 17:30:52 MDT 2014},
  bibsource       = {http://portal.acm.org/;
                     http://www.math.utah.edu/pub/tex/bib/taco.bib},
  abstract        = {Memory management searches for the resources required
                     to store the concurrently alive elements. The solution
                     quality is affected by the representation of the
                     element accesses: a sub-optimal representation leads to
                     overestimation and a non-scalable representation
                     increases the exploration time. We propose a
                     methodology to near-optimal and scalable represent
                     regular and irregular accesses. The representation
                     consists of a set of pattern entries to compactly
                     describe the behavior of the memory accesses and of
                     pattern operations to consistently combine the pattern
                     entries. The result is a final sequence of pattern
                     entries which represents the global access scheme
                     without unnecessary overestimation.},
  acknowledgement = ack-nhfb,
  articleno       = {13},
  fjournal        = {ACM Transactions on Architecture and Code Optimization
                     (TACO)},
  journal-URL     = {http://portal.acm.org/browse_dl.cfm?idx=J924},
}

%%% TACO 11(1), Feb 2014, article 14.
%%% The third author's surname is spelled O'Boyle (capital B after the
%%% apostrophe); "GCC's" in the abstract is a possessive.
@Article{Leather:2014:AFG,
  author =       "Hugh Leather and Edwin Bonilla and Michael O'Boyle",
  title =        "Automatic feature generation for machine
                 learning--based optimising compilation",
  journal =      j-TACO,
  volume =       "11",
  number =       "1",
  pages =        "14:1--14:??",
  month =        feb,
  year =         "2014",
  CODEN =        "????",
  DOI =          "http://dx.doi.org/10.1145/2536688",
  ISSN =         "1544-3566 (print), 1544-3973 (electronic)",
  ISSN-L =       "1544-3566",
  bibdate =      "Fri Mar 14 17:30:52 MDT 2014",
  bibsource =    "http://portal.acm.org/;
                 http://www.math.utah.edu/pub/tex/bib/taco.bib",
  abstract =     "Recent work has shown that machine learning can
                 automate and in some cases outperform handcrafted
                 compiler optimisations. Central to such an approach is
                 that machine learning techniques typically rely upon
                 summaries or features of the program. The quality of
                 these features is critical to the accuracy of the
                 resulting machine learned algorithm; no machine
                 learning method will work well with poorly chosen
                 features. However, due to the size and complexity of
                 programs, theoretically there are an infinite number of
                 potential features to choose from. The compiler writer
                 now has to expend effort in choosing the best features
                 from this space. This article develops a novel
                 mechanism to automatically find those features that
                 most improve the quality of the machine learned
                 heuristic. The feature space is described by a grammar
                 and is then searched with genetic programming and
                 predictive modelling. We apply this technique to loop
                 unrolling in GCC 4.3.1 and evaluate our approach on a
                 Pentium 6. On a benchmark suite of 57 programs, GCC's
                 hard-coded heuristic achieves only 3\% of the maximum
                 performance available, whereas a state-of-the-art
                 machine learning approach with hand-coded features
                 obtains 59\%. Our feature generation technique is able
                 to achieve 76\% of the maximum available speedup,
                 outperforming existing approaches.",
  acknowledgement = ack-nhfb,
  articleno =    "14",
  fjournal =     "ACM Transactions on Architecture and Code Optimization
                 (TACO)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J924",
}

%%% TACO 11(2), Jun 2014, article 16.
@Article{Ren:2014:POE,
  author          = {Bin Ren and Todd Mytkowicz and Gagan Agrawal},
  title           = {A Portable Optimization Engine for Accelerating
                     Irregular Data-Traversal Applications on {SIMD}
                     Architectures},
  journal         = j-TACO,
  volume          = {11},
  number          = {2},
  pages           = {16:1--16:??},
  month           = jun,
  year            = {2014},
  CODEN           = {????},
  DOI             = {http://dx.doi.org/10.1145/2632215},
  ISSN            = {1544-3566 (print), 1544-3973 (electronic)},
  ISSN-L          = {1544-3566},
  bibdate         = {Mon Jun 30 19:02:49 MDT 2014},
  bibsource       = {http://portal.acm.org/;
                     http://www.math.utah.edu/pub/tex/bib/taco.bib},
  abstract        = {Fine-grained data parallelism is increasingly common
                     in the form of longer vectors integrated with
                     mainstream processors (SSE, AVX) and various GPU
                     architectures. This article develops support for
                     exploiting such data parallelism for a class of
                     nonnumeric, nongraphic applications, which perform
                     computations while traversing many independent,
                     irregular data structures. We address this problem by
                     developing several novel techniques. First, for code
                     generation, we develop an intermediate language for
                     specifying such traversals, followed by a runtime
                     scheduler that maps traversals to various SIMD units.
                     Second, we observe that good data locality is crucial
                     to sustained performance from SIMD architectures,
                     whereas many applications that operate on irregular
                     data structures (e.g., trees and graphs) have poor data
                     locality. To address this challenge, we develop a set
                     of data layout optimizations that improve spatial
                     locality for applications that traverse many irregular
                     data structures. Unlike prior data layout
                     optimizations, our approach incorporates a notion of
                     both interthread and intrathread spatial reuse into
                     data layout. Finally, we enable performance portability
                     (i.e., the ability to automatically optimize
                     applications for different architectures) by accurately
                     modeling the impact of inter- and intrathread locality
                     on program performance. As a consequence, our model can
                     predict which data layout optimization to use on a wide
                     variety of SIMD architectures. To demonstrate the
                     efficacy of our approach and optimizations, we first
                     show how they enable up to a 12X speedup on one SIMD
                     architecture for a set of real-world applications. To
                     demonstrate that our approach enables performance
                     portability, we show how our model predicts the optimal
                     layout for applications across a diverse set of three
                     real-world SIMD architectures, which offers as much as
                     45\% speedup over a suboptimal solution.},
  acknowledgement = ack-nhfb,
  articleno       = {16},
  fjournal        = {ACM Transactions on Architecture and Code Optimization
                     (TACO)},
  journal-URL     = {http://portal.acm.org/browse_dl.cfm?idx=J924},
}

%%% TACO 11(2), Jun 2014, article 18.
@Article{Shen:2014:RSB,
  author          = {Bor-Yeh Shen and Wei-Chung Hsu and Wuu Yang},
  title           = {A Retargetable Static Binary Translator for the {ARM}
                     Architecture},
  journal         = j-TACO,
  volume          = {11},
  number          = {2},
  pages           = {18:1--18:??},
  month           = jun,
  year            = {2014},
  CODEN           = {????},
  DOI             = {http://dx.doi.org/10.1145/2629335},
  ISSN            = {1544-3566 (print), 1544-3973 (electronic)},
  ISSN-L          = {1544-3566},
  bibdate         = {Mon Jun 30 19:02:49 MDT 2014},
  bibsource       = {http://portal.acm.org/;
                     http://www.math.utah.edu/pub/tex/bib/taco.bib},
  abstract        = {Machines designed with new but incompatible
                     Instruction Set Architecture (ISA) may lack proper
                     applications. Binary translation can address this
                     incompatibility by migrating applications from one
                     legacy ISA to a new one, although binary translation
                     has problems such as code discovery for variable-length
                     ISA and code location issues for handling indirect
                     branches. Dynamic Binary Translation (DBT) has been
                     widely adopted for migrating applications since it
                     avoids those problems. Static Binary Translation (SBT)
                     is a less general solution and has not been actively
                     researched. However, SBT performs more aggressive
                     optimizations, which could yield more compact code and
                     better code quality. Applications translated by SBT can
                     consume less memory, processor cycles, and power than
                     DBT and can be started more quickly. These advantages
                     are even more critical for embedded systems than for
                     general systems. In this article, we designed and
                     implemented a new SBT tool, called LLBT, which
                     translates ARM instructions into LLVM IRs and then
                     retargets the LLVM IRs to various ISAs, including x86,
                     x86-64, ARM, and MIPS. LLBT leverages two important
                     functionalities from LLVM: comprehensive optimizations
                     and retargetability. More importantly, LLBT solves the
                     code discovery problem for ARM/Thumb binaries without
                     resorting to interpretation. LLBT also effectively
                     reduced the size of the address mapping table, making
                     SBT a viable solution for embedded systems. Our
                     experiments based on the EEMBC benchmark suite show
                     that the LLBT-generated code can run more than $ 6
                     \times $ and $ 2.3 \times $ faster on average than
                     emulation with QEMU and HQEMU, respectively.},
  acknowledgement = ack-nhfb,
  articleno       = {18},
  fjournal        = {ACM Transactions on Architecture and Code Optimization
                     (TACO)},
  journal-URL     = {http://portal.acm.org/browse_dl.cfm?idx=J924},
}

%%% TACO 11(2), Jun 2014, article 19.
%%% LP-NUCA is a hyphenated acronym (Light-Power NUCA, as spelled in the
%%% abstract), so the title uses a plain hyphen rather than an en dash.
@Article{Gracia:2014:RLN,
  author =       "Dar{\'\i}o Su{\'a}rez Gracia and Alexandra
                 Ferrer{\'o}n and Luis Montesano {Del Campo} and Teresa
                 Monreal Arnal and V{\'\i}ctor Vi{\~n}als Y{\'u}fera",
  title =        "Revisiting {LP-NUCA} Energy Consumption: Cache
                 Access Policies and Adaptive Block Dropping",
  journal =      j-TACO,
  volume =       "11",
  number =       "2",
  pages =        "19:1--19:??",
  month =        jun,
  year =         "2014",
  CODEN =        "????",
  DOI =          "http://dx.doi.org/10.1145/2632217",
  ISSN =         "1544-3566 (print), 1544-3973 (electronic)",
  ISSN-L =       "1544-3566",
  bibdate =      "Mon Jun 30 19:02:49 MDT 2014",
  bibsource =    "http://portal.acm.org/;
                 http://www.math.utah.edu/pub/tex/bib/taco.bib",
  abstract =     "Cache working-set adaptation is key as embedded
                 systems move to multiprocessor and Simultaneous
                 Multithreaded Architectures (SMT) because interthread
                 pollution harms system performance and battery life.
                 Light-Power NUCA (LP-NUCA) is a working-set adaptive
                 cache that depends on temporal-locality to save energy.
                 This work identifies the sources of energy waste in
                 LP-NUCAs: parallel access to the tag and data arrays of
                 the tiles and low locality phases with useless block
                 migration. To counteract both issues, we prove that
                 switching to serial access reduces energy without
                 harming performance and propose a machine learning
                 Adaptive Drop Rate (ADR) controller that minimizes the
                 amount of replacement and migration when locality is
                 low. This work demonstrates that these techniques
                 efficiently adapt the cache drop and access policies to
                 save energy. They reduce LP-NUCA consumption 22.7\% for
                 1SMT. With interthread cache contention in 2SMT, the
                 savings rise to 29\%. Versus a conventional
                 organization, energy--delay improves 20.8\% and 25\%
                 for 1- and 2SMT benchmarks, and, in 65\% of the 2SMT
                 mixes, gains are larger than 20\%.",
  acknowledgement = ack-nhfb,
  articleno =    "19",
  fjournal =     "ACM Transactions on Architecture and Code Optimization
                 (TACO)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J924",
}

%%% TACO 11(2), Jun 2014, article 21.
@Article{Fang:2014:PPA,
  author          = {Shuangde Fang and Zidong Du and Yuntan Fang and
                     Yuanjie Huang and Yang Chen and Lieven Eeckhout and
                     Olivier Temam and Huawei Li and Yunji Chen and
                     Chengyong Wu},
  title           = {Performance Portability Across Heterogeneous {SoCs}
                     Using a Generalized Library-Based Approach},
  journal         = j-TACO,
  volume          = {11},
  number          = {2},
  pages           = {21:1--21:??},
  month           = jun,
  year            = {2014},
  CODEN           = {????},
  DOI             = {http://dx.doi.org/10.1145/2608253},
  ISSN            = {1544-3566 (print), 1544-3973 (electronic)},
  ISSN-L          = {1544-3566},
  bibdate         = {Mon Jun 30 19:02:49 MDT 2014},
  bibsource       = {http://portal.acm.org/;
                     http://www.math.utah.edu/pub/tex/bib/taco.bib},
  abstract        = {Because of tight power and energy constraints,
                     industry is progressively shifting toward heterogeneous
                     system-on-chip (SoC) architectures composed of a mix of
                     general-purpose cores along with a number of
                     accelerators. However, such SoC architectures can be
                     very challenging to efficiently program for the vast
                     majority of programmers, due to numerous programming
                     approaches and languages. Libraries, on the other hand,
                     provide a simple way to let programmers take advantage
                     of complex architectures, which does not require
                     programmers to acquire new accelerator-specific or
                     domain-specific languages. Increasingly, library-based,
                     also called algorithm-centric, programming approaches
                     propose to generalize the usage of libraries and to
                     compose programs around these libraries, instead of
                     using libraries as mere complements. In this article,
                     we present a software framework for achieving
                     performance portability by leveraging a generalized
                     library-based approach. Inspired by the notion of a
                     component, as employed in software engineering and
                     HW/SW codesign, we advocate nonexpert programmers to
                     write simple wrapper code around existing libraries to
                     provide simple but necessary semantic information to
                     the runtime. To achieve performance portability, the
                     runtime employs machine learning (simulated annealing)
                     to select the most appropriate accelerator and its
                     parameters for a given algorithm. This selection
                     factors in the possibly complex composition of
                     algorithms used in the application, the communication
                     among the various accelerators, and the tradeoff
                     between different objectives (i.e., accuracy,
                     performance, and energy). Using a set of benchmarks run
                     on a real heterogeneous SoC composed of a multicore
                     processor and a GPU, we show that the runtime overhead
                     is fairly small at 5.1\% for the GPU and 6.4\% for the
                     multi-core. We then apply our accelerator selection
                     approach to a simulated SoC platform containing
                     multiple inexact accelerators. We show that accelerator
                     selection together with hardware parameter tuning
                     achieves an average 46.2\% energy reduction and a
                     speedup of 2.1$ \times $ while meeting the desired
                     application error target.},
  acknowledgement = ack-nhfb,
  articleno       = {21},
  fjournal        = {ACM Transactions on Architecture and Code Optimization
                     (TACO)},
  journal-URL     = {http://portal.acm.org/browse_dl.cfm?idx=J924},
}

%%% TACO 11(2), Jun 2014, article 22.
@Article{Kaitoua:2014:HED,
  author          = {Abdulrahman Kaitoua and Hazem Hajj and Mazen A. R.
                     Saghir and Hassan Artail and Haitham Akkary and
                     Mariette Awad and Mageda Sharafeddine and Khaleel
                     Mershad},
  title           = {{Hadoop} Extensions for Distributed Computing on
                     Reconfigurable Active {SSD} Clusters},
  journal         = j-TACO,
  volume          = {11},
  number          = {2},
  pages           = {22:1--22:??},
  month           = jun,
  year            = {2014},
  CODEN           = {????},
  DOI             = {http://dx.doi.org/10.1145/2608199},
  ISSN            = {1544-3566 (print), 1544-3973 (electronic)},
  ISSN-L          = {1544-3566},
  bibdate         = {Mon Oct 27 17:02:18 MDT 2014},
  bibsource       = {http://portal.acm.org/;
                     http://www.math.utah.edu/pub/tex/bib/taco.bib},
  abstract        = {In this article, we propose new extensions to Hadoop
                     to enable clusters of reconfigurable active solid-state
                     drives (RASSDs) to process streaming data from SSDs
                     using FPGAs. We also develop an analytical model to
                     estimate the performance of RASSD clusters running
                     under Hadoop. Using the Hadoop RASSD platform and
                     network simulators, we validate our design and
                     demonstrate its impact on performance for different
                     workloads taken from Stanford's Phoenix MapReduce
                     project. Our results show that for a hardware
                     acceleration factor of 20$ \times $, compute-intensive
                     workloads processing 153MB of data can run up to 11$
                     \times $ faster than a standard Hadoop cluster.},
  acknowledgement = ack-nhfb,
  articleno       = {22},
  fjournal        = {ACM Transactions on Architecture and Code Optimization
                     (TACO)},
  journal-URL     = {http://portal.acm.org/browse_dl.cfm?idx=J924},
}

%%% TACO 11(3), Oct 2014, article 23.
%%% The abstract carries no footnote marker after "(LLCs).": the footnote
%%% text itself is not part of this entry, so a marker would dangle.
@Article{Wang:2014:PSR,
  author =       "Jue Wang and Xiangyu Dong and Yuan Xie",
  title =        "Preventing {STT-RAM} Last-Level Caches from Port
                 Obstruction",
  journal =      j-TACO,
  volume =       "11",
  number =       "3",
  pages =        "23:1--23:??",
  month =        oct,
  year =         "2014",
  CODEN =        "????",
  DOI =          "http://dx.doi.org/10.1145/2633046",
  ISSN =         "1544-3566 (print), 1544-3973 (electronic)",
  ISSN-L =       "1544-3566",
  bibdate =      "Mon Oct 27 17:02:20 MDT 2014",
  bibsource =    "http://portal.acm.org/;
                 http://www.math.utah.edu/pub/tex/bib/taco.bib",
  abstract =     "Many new nonvolatile memory (NVM) technologies have
                 been heavily studied to replace the power-hungry
                 SRAM/DRAM-based memory hierarchy in today's computers.
                 Among various emerging NVM technologies, Spin-Transfer
                 Torque RAM (STT-RAM) has many benefits, such as fast
                 read latency, low leakage power, and high density,
                 making it a promising candidate for last-level caches
                 (LLCs). However, STT-RAM write operation is
                 expensive. In particular, a long STT-RAM cache write
                 operation might obstruct other cache accesses and
                 result in severe performance degradation. Consequently,
                 how to mitigate STT-RAM write overhead is critical to
                 the success of STT-RAM adoption. In this article, we
                 propose an obstruction-aware cache management policy
                 called OAP. OAP monitors cache traffic, detects
                 LLC-obstructive processes, and differentiates the cache
                 accesses from different processes. Our experiment on a
                 four-core architecture with an 8MB STT-RAM L3 cache
                 shows a 14\% performance improvement and 64\% energy
                 reduction.",
  acknowledgement = ack-nhfb,
  articleno =    "23",
  fjournal =     "ACM Transactions on Architecture and Code Optimization
                 (TACO)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J924",
}

%%% TACO 11(3), Oct 2014, article 24.
@Article{Gonzalez-Mesa:2014:ETM,
  author          = {M. A. Gonzalez-Mesa and Eladio Gutierrez and Emilio L.
                     Zapata and Oscar Plata},
  title           = {Effective Transactional Memory Execution Management
                     for Improved Concurrency},
  journal         = j-TACO,
  volume          = {11},
  number          = {3},
  pages           = {24:1--24:??},
  month           = oct,
  year            = {2014},
  CODEN           = {????},
  DOI             = {http://dx.doi.org/10.1145/2633048},
  ISSN            = {1544-3566 (print), 1544-3973 (electronic)},
  ISSN-L          = {1544-3566},
  bibdate         = {Mon Oct 27 17:02:20 MDT 2014},
  bibsource       = {http://portal.acm.org/;
                     http://www.math.utah.edu/pub/tex/bib/taco.bib},
  abstract        = {This article describes a transactional memory
                     execution model intended to exploit maximum parallelism
                     from sequential and multithreaded programs. A program
                     code section is partitioned into chunks that will be
                     mapped onto threads and executed transactionally. These
                     transactions run concurrently and out of order, trying
                     to exploit maximum parallelism but managed by a
                     specific fully distributed commit control to meet data
                     dependencies. To accomplish correct parallel execution,
                     a partial precedence order relation is derived from the
                     program code section and/or defined by the programmer.
                     When a conflict between chunks is eagerly detected, the
                     precedence order relation is used to determine the best
                     policy to solve the conflict that preserves the
                     precedence order while maximizing concurrency. The
                     model defines a new transactional state called executed
                     but not committed. This state allows exploiting
                     concurrency on two levels: intrathread and interthread.
                     Intrathread concurrency is improved by having pending
                     uncommitted transactions while executing a new one in
                     the same thread. The new state improves interthread
                     concurrency because it permits out-of-order transaction
                     commits regarding the precedence order. Our model has
                     been implemented in a lightweight software
                     transactional memory system, TinySTM, and has been
                     evaluated on a set of benchmarks obtaining an important
                     performance improvement over the baseline TM system.},
  acknowledgement = ack-nhfb,
  articleno       = {24},
  fjournal        = {ACM Transactions on Architecture and Code Optimization
                     (TACO)},
  journal-URL     = {http://portal.acm.org/browse_dl.cfm?idx=J924},
}

%%% TACO 11(3), Oct 2014, article 25.
@Article{Kumar:2014:EPG,
  author          = {Rakesh Kumar and Alejandro Mart{\'\i}nez and Antonio
                     Gonz{\'a}lez},
  title           = {Efficient Power Gating of {SIMD} Accelerators Through
                     Dynamic Selective Devectorization in an {HW\slash SW}
                     Codesigned Environment},
  journal         = j-TACO,
  volume          = {11},
  number          = {3},
  pages           = {25:1--25:??},
  month           = oct,
  year            = {2014},
  CODEN           = {????},
  DOI             = {http://dx.doi.org/10.1145/2629681},
  ISSN            = {1544-3566 (print), 1544-3973 (electronic)},
  ISSN-L          = {1544-3566},
  bibdate         = {Mon Oct 27 17:02:20 MDT 2014},
  bibsource       = {http://portal.acm.org/;
                     http://www.math.utah.edu/pub/tex/bib/taco.bib},
  abstract        = {Leakage energy is a growing concern in current and
                     future microprocessors. Functional units of
                     microprocessors are responsible for a major fraction of
                     this energy. Therefore, reducing functional unit
                     leakage has received much attention in recent years.
                     Power gating is one of the most widely used techniques
                     to minimize leakage energy. Power gating turns off the
                     functional units during the idle periods to reduce the
                     leakage. Therefore, the amount of leakage energy
                     savings is directly proportional to the idle time
                     duration. This article focuses on increasing the idle
                     interval for the higher SIMD lanes. The applications
                     are profiled dynamically, in a hardware/software
                     codesigned environment, to find the higher SIMD lanes'
                     usage pattern. If the higher lanes need to be turned on
                     for small time periods, the corresponding portion of
                     the code is devectorized to keep the higher lanes off.
                     The devectorized code is executed on the lowest SIMD
                     lane. Our experimental results show that the average
                     energy savings of the proposed mechanism are 15\%,
                     12\%, and 71\% greater than power gating for
                     SPECFP2006, Physicsbench, and Eigen benchmark suites,
                     respectively. Moreover, the slowdown caused by
                     devectorization is negligible.},
  acknowledgement = ack-nhfb,
  articleno       = {25},
  fjournal        = {ACM Transactions on Architecture and Code Optimization
                     (TACO)},
  journal-URL     = {http://portal.acm.org/browse_dl.cfm?idx=J924},
}

%%% TACO 11(3), Oct 2014, article 26.
%%% The subtitle after the colon starts with a capital ("An"), matching
%%% the title case used for the rest of the title.
@Article{Carlo:2014:FAA,
  author =       "Stefano {Di Carlo} and Salvatore Galfano and Marco
                 Indaco and Paolo Prinetto and Davide Bertozzi and Piero
                 Olivo and Cristian Zambelli",
  title =        "{FLARES}: An Aging Aware Algorithm to Autonomously
                 Adapt the Error Correction Capability in {NAND} Flash
                 Memories",
  journal =      j-TACO,
  volume =       "11",
  number =       "3",
  pages =        "26:1--26:??",
  month =        oct,
  year =         "2014",
  CODEN =        "????",
  DOI =          "http://dx.doi.org/10.1145/2631919",
  ISSN =         "1544-3566 (print), 1544-3973 (electronic)",
  ISSN-L =       "1544-3566",
  bibdate =      "Mon Oct 27 17:02:20 MDT 2014",
  bibsource =    "http://portal.acm.org/;
                 http://www.math.utah.edu/pub/tex/bib/taco.bib",
  abstract =     "With the advent of solid-state storage systems, NAND
                 flash memories are becoming a key storage technology.
                 However, they suffer from serious reliability and
                 endurance issues during the operating lifetime that can
                 be handled by the use of appropriate error correction
                 codes (ECCs) in order to reconstruct the information
                 when needed. Adaptable ECCs may provide the flexibility
                 to avoid worst-case reliability design, thus leading to
                 improved performance. However, a way to control such
                 adaptable ECCs' strength is required. This article
                 proposes FLARES, an algorithm able to adapt the ECC
                 correction capability of each page of a flash based on
                 a flash RBER prediction model and on a measurement of
                 the number of errors detected in a given time window.
                 FLARES has been fully implemented within the YAFFS 2
                 filesystem under the Linux operating system. This
                 allowed us to perform an extensive set of simulations
                 on a set of standard benchmarks that highlighted the
                 benefit of FLARES on the overall storage subsystem
                 performances.",
  acknowledgement = ack-nhfb,
  articleno =    "26",
  fjournal =     "ACM Transactions on Architecture and Code Optimization
                 (TACO)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J924",
}

@article{Bartolini:2014:AFG,
  author = {Davide B. Bartolini and Filippo Sironi and Donatella Sciuto and
    Marco D. Santambrogio},
  title = {Automated Fine-Grained {CPU} Provisioning for Virtual Machines},
  journal = j-TACO,
  volume = 11,
  number = 3,
  pages = {27:1--27:??},
  month = oct,
  year = 2014,
  coden = {????},
  doi = {http://dx.doi.org/10.1145/2637480},
  issn = {1544-3566 (print), 1544-3973 (electronic)},
  issn-l = {1544-3566},
  bibdate = {Mon Oct 27 17:02:20 MDT 2014},
  bibsource = {http://portal.acm.org/;
    http://www.math.utah.edu/pub/tex/bib/multithreading.bib;
    http://www.math.utah.edu/pub/tex/bib/taco.bib;
    http://www.math.utah.edu/pub/tex/bib/virtual-machines.bib},
  abstract = {Ideally, the pay-as-you-go model of Infrastructure as a Service
    (IaaS) clouds should enable users to rent just enough resources (e.g.,
    CPU or memory bandwidth) to fulfill their service level objectives
    (SLOs). Achieving this goal is hard on current IaaS offers, which
    require users to explicitly specify the amount of resources to reserve;
    this requirement is nontrivial for users, because estimating the amount
    of resources needed to attain application-level SLOs is often complex,
    especially when resources are virtualized and the service provider
    colocates virtual machines (VMs) on host nodes. For this reason, users
    who deploy VMs subject to SLOs are usually prone to overprovisioning
    resources, thus resulting in inflated business costs. This article
    tackles this issue with AutoPro: a runtime system that enhances IaaS
    clouds with automated and fine-grained resource provisioning based on
    performance SLOs. Our main contribution with AutoPro is filling the gap
    between application-level performance SLOs and allocation of a
    contended resource, without requiring explicit reservations from users.
    In this article, we focus on CPU bandwidth allocation to
    throughput-driven, compute-intensive multithreaded applications
    colocated on a multicore processor; we show that a theoretically sound,
    yet simple, control strategy can enable automated fine-grained
    allocation of this contended resource, without the need for offline
    profiling. Additionally, AutoPro helps service providers optimize
    infrastructure utilization by provisioning idle resources to
    best-effort workloads, so as to maximize node-level utilization. Our
    extensive experimental evaluation confirms that AutoPro is able to
    automatically determine and enforce allocations to meet performance
    SLOs while maximizing node-level utilization by supporting batch
    workloads on a best-effort basis.},
  acknowledgement = ack-nhfb,
  articleno = 27,
  fjournal = {ACM Transactions on Architecture and Code Optimization (TACO)},
  journal-url = {http://portal.acm.org/browse_dl.cfm?idx=J924},
}

@article{Carlson:2014:EHL,
  author = {Trevor E. Carlson and Wim Heirman and Stijn Eyerman and Ibrahim
    Hur and Lieven Eeckhout},
  title = {An Evaluation of High-Level Mechanistic Core Models},
  journal = j-TACO,
  volume = 11,
  number = 3,
  pages = {28:1--28:??},
  month = oct,
  year = 2014,
  coden = {????},
  doi = {http://dx.doi.org/10.1145/2629677},
  issn = {1544-3566 (print), 1544-3973 (electronic)},
  issn-l = {1544-3566},
  bibdate = {Mon Oct 27 17:02:20 MDT 2014},
  bibsource = {http://portal.acm.org/;
    http://www.math.utah.edu/pub/tex/bib/taco.bib},
  abstract = {Large core counts and complex cache hierarchies are increasing
    the burden placed on commonly used simulation and modeling techniques.
    Although analytical models provide fast results, they do not apply to
    complex, many-core shared-memory systems. In contrast, detailed
    cycle-level simulation can be accurate but also tends to be slow, which
    limits the number of configurations that can be evaluated. A middle
    ground is needed that provides for fast simulation of complex many-core
    processors while still providing accurate results. In this article, we
    explore, analyze, and compare the accuracy and simulation speed of
    high-abstraction core models as a potential solution to slow
    cycle-level simulation. We describe a number of enhancements to
    interval simulation to improve its accuracy while maintaining
    simulation speed. In addition, we introduce the instruction-window
    centric (IW-centric) core model, a new mechanistic core model that
    bridges the gap between interval simulation and cycle-accurate
    simulation by enabling high-speed simulations with higher levels of
    detail. We also show that using accurate core models like these are
    important for memory subsystem studies, and that simple, naive models,
    like a one-IPC core model, can lead to misleading and incorrect results
    and conclusions in practical design studies. Validation against real
    hardware shows good accuracy, with an average single-core error of
    11.1\% and a maximum of 18.8\% for the IW-centric model with a
    1.5$ \times $ slowdown compared to interval simulation.},
  acknowledgement = ack-nhfb,
  articleno = 28,
  fjournal = {ACM Transactions on Architecture and Code Optimization (TACO)},
  journal-url = {http://portal.acm.org/browse_dl.cfm?idx=J924},
}

@article{Hijaz:2014:NLN,
  author = {Farrukh Hijaz and Omer Khan},
  title = {{NUCA-L1}: a Non-Uniform Access Latency Level-1 Cache Architecture
    for Multicores Operating at Near-Threshold Voltages},
  journal = j-TACO,
  volume = 11,
  number = 3,
  pages = {29:1--29:??},
  month = oct,
  year = 2014,
  coden = {????},
  doi = {http://dx.doi.org/10.1145/2631918},
  issn = {1544-3566 (print), 1544-3973 (electronic)},
  issn-l = {1544-3566},
  bibdate = {Mon Oct 27 17:02:20 MDT 2014},
  bibsource = {http://portal.acm.org/;
    http://www.math.utah.edu/pub/tex/bib/taco.bib},
  abstract = {Research has shown that operating in the near-threshold region
    is expected to provide up to 10$ \times $ energy efficiency for future
    processors. However, reliable operation below a minimum voltage
    (Vccmin) cannot be guaranteed due to process variations. Because SRAM
    margins can easily be violated at near-threshold voltages, their
    bit-cell failure rates are expected to rise steeply. Multicore
    processors rely on fast private L1 caches to exploit data locality and
    achieve high performance. In the presence of high bit-cell fault rates,
    traditionally an L1 cache either sacrifices capacity or incurs
    additional latency to correct the faults. We observe that L1 cache
    sensitivity to hit latency offers a design trade-off between capacity
    and latency. When fault rate is high at extreme Vccmin, it is
    beneficial to recover L1 cache capacity, even if it comes at the cost
    of additional latency. However, at low fault rates, the additional
    constant latency to recover cache capacity degrades performance. With
    this trade-off in mind, we propose a Non-Uniform Cache Access L1
    architecture (NUCA-L1) that avoids additional latency on accesses to
    fault-free cache lines. To mitigate the capacity bottleneck, it deploys
    a correction mechanism to recover capacity at the cost of additional
    latency. Using extensive simulations of a 64-core multicore, we
    demonstrate that at various bit-cell fault rates, our proposed private
    NUCA-L1 cache architecture performs better than state-of-the-art
    schemes, along with a significant reduction in energy consumption.},
  acknowledgement = ack-nhfb,
  articleno = 29,
  fjournal = {ACM Transactions on Architecture and Code Optimization (TACO)},
  journal-url = {http://portal.acm.org/browse_dl.cfm?idx=J924},
}

@article{Drebes:2014:TAD,
  author = {Andi Drebes and Karine Heydemann and Nathalie Drach and Antoniu
    Pop and Albert Cohen},
  title = {Topology-Aware and Dependence-Aware Scheduling and Memory
    Allocation for Task-Parallel Languages},
  journal = j-TACO,
  volume = 11,
  number = 3,
  pages = {30:1--30:??},
  month = oct,
  year = 2014,
  coden = {????},
  doi = {http://dx.doi.org/10.1145/2641764},
  issn = {1544-3566 (print), 1544-3973 (electronic)},
  issn-l = {1544-3566},
  bibdate = {Mon Oct 27 17:02:20 MDT 2014},
  bibsource = {http://portal.acm.org/;
    http://www.math.utah.edu/pub/tex/bib/taco.bib},
  abstract = {We present a joint scheduling and memory allocation algorithm
    for efficient execution of task-parallel programs on non-uniform memory
    architecture (NUMA) systems. Task and data placement decisions are
    based on a static description of the memory hierarchy and on runtime
    information about intertask communication. Existing locality-aware
    scheduling strategies for fine-grained tasks have strong limitations:
    they are specific to some class of machines or applications, they do
    not handle task dependences, they require manual program annotations,
    or they rely on fragile profiling schemes. By contrast, our solution
    makes no assumption on the structure of programs or on the layout of
    data in memory. Experimental results, based on the OpenStream language,
    show that locality of accesses to main memory of scientific
    applications can be increased significantly on a 64-core machine,
    resulting in a speedup of up to 1.63$ \times $ compared to a
    state-of-the-art work-stealing scheduler.},
  acknowledgement = ack-nhfb,
  articleno = 30,
  fjournal = {ACM Transactions on Architecture and Code Optimization (TACO)},
  journal-url = {http://portal.acm.org/browse_dl.cfm?idx=J924},
}

%%% NOTE: The first author's surname is Tavva.  The citation label
%%% below keeps the earlier spelling ``Tawa'' so that existing \cite
%%% commands that use this label continue to resolve.
@Article{Tawa:2014:EEF,
  author =       "Venkata Kalyan Tavva and Ravi Kasha and Madhu Mutyam",
  title =        "{EFGR}: an Enhanced Fine Granularity Refresh Feature
                 for High-Performance {DDR4 DRAM} Devices",
  journal =      j-TACO,
  volume =       "11",
  number =       "3",
  pages =        "31:1--31:??",
  month =        oct,
  year =         "2014",
  CODEN =        "????",
  DOI =          "http://dx.doi.org/10.1145/2656340",
  ISSN =         "1544-3566 (print), 1544-3973 (electronic)",
  ISSN-L =       "1544-3566",
  bibdate =      "Mon Oct 27 17:02:20 MDT 2014",
  bibsource =    "http://portal.acm.org/;
                 http://www.math.utah.edu/pub/tex/bib/taco.bib",
  abstract =     "High-density DRAM devices spend significant time
                 refreshing the DRAM cells, leading to performance drop.
                 The JEDEC DDR4 standard provides a Fine Granularity
                 Refresh (FGR) feature to tackle refresh. Motivated by
                 the observation that in FGR mode, only a few banks are
                 involved, we propose an Enhanced FGR (EFGR) feature
                 that introduces three optimizations to the basic FGR
                 feature and exposes the bank-level parallelism within
                 the rank even during the refresh. The first
                 optimization decouples the nonrefreshing banks. The
                 second and third optimizations determine the maximum
                 number of nonrefreshing banks that can be active during
                 refresh and selectively precharge the banks before
                 refresh, respectively. Our simulation results show that
                 the EFGR feature is able to recover almost 56.6\% of
                 the performance loss incurred due to refresh
                 operations.",
  acknowledgement = ack-nhfb,
  articleno =    "31",
  fjournal =     "ACM Transactions on Architecture and Code Optimization
                 (TACO)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J924",
}

@article{Yalcin:2014:EEC,
  author = {Gulay Yalcin and Oguz Ergin and Emrah Islek and Osman Sabri Unsal
    and Adrian Cristal},
  title = {Exploiting Existing Comparators for Fine-Grained Low-Cost Error
    Detection},
  journal = j-TACO,
  volume = 11,
  number = 3,
  pages = {32:1--32:??},
  month = oct,
  year = 2014,
  coden = {????},
  doi = {http://dx.doi.org/10.1145/2656341},
  issn = {1544-3566 (print), 1544-3973 (electronic)},
  issn-l = {1544-3566},
  bibdate = {Mon Oct 27 17:02:20 MDT 2014},
  bibsource = {http://portal.acm.org/;
    http://www.math.utah.edu/pub/tex/bib/taco.bib},
  abstract = {Fault tolerance has become a fundamental concern in computer
    design, in addition to performance and power. Although several error
    detection schemes have been proposed to discover a faulty core in the
    system, these proposals could waste the whole core, including many
    error-free structures in it after error detection. Moreover, many
    fault-tolerant designs require additional hardware for data replication
    or for comparing the replicated data. In this study, we provide a
    low-cost, fine-grained error detection scheme by exploiting already
    existing comparators and data replications in the several pipeline
    stages such as issue queue, rename logic, and translation lookaside
    buffer. We reduce the vulnerability of the source register tags in IQ
    by 60\%, the vulnerability of instruction TLB by 64\%, the
    vulnerability of data TLB by 45\%, and the vulnerability of the
    register tags of rename logic by 20\%.},
  acknowledgement = ack-nhfb,
  articleno = 32,
  fjournal = {ACM Transactions on Architecture and Code Optimization (TACO)},
  journal-url = {http://portal.acm.org/browse_dl.cfm?idx=J924},
}

@article{Ramachandran:2014:HFR,
  author = {Pradeep Ramachandran and Siva Kumar Sastry Hari and Manlap Li and
    Sarita V. Adve},
  title = {Hardware Fault Recovery for {I/O} Intensive Applications},
  journal = j-TACO,
  volume = 11,
  number = 3,
  pages = {33:1--33:??},
  month = oct,
  year = 2014,
  coden = {????},
  doi = {http://dx.doi.org/10.1145/2656342},
  issn = {1544-3566 (print), 1544-3973 (electronic)},
  issn-l = {1544-3566},
  bibdate = {Mon Oct 27 17:02:20 MDT 2014},
  bibsource = {http://portal.acm.org/;
    http://www.math.utah.edu/pub/tex/bib/taco.bib},
  abstract = {With continued process scaling, the rate of hardware failures
    in commodity systems is increasing. Because these commodity systems are
    highly sensitive to cost, traditional solutions that employ heavy
    redundancy to handle such failures are no longer acceptable owing to
    their high associated costs. Detecting such faults by identifying
    anomalous software execution and recovering through
    checkpoint-and-replay is emerging as a viable low-cost alternative for
    future commodity systems. An important but commonly ignored aspect of
    such solutions is ensuring that external outputs to the system are
    fault-free. The outputs must be delayed until the detectors guarantee
    this, influencing fault-free performance. The overheads for resiliency
    must thus be evaluated while taking these delays into consideration;
    prior work has largely ignored this relationship. This article concerns
    recovery for I/O intensive applications from in-core faults. We present
    a strategy to buffer external outputs using dedicated hardware and show
    that checkpoint intervals previously considered as acceptable incur
    exorbitant overheads when hardware buffering is considered. We then
    present two techniques to reduce the checkpoint interval and
    demonstrate a practical solution that provides high resiliency while
    incurring low overheads.},
  acknowledgement = ack-nhfb,
  articleno = 33,
  fjournal = {ACM Transactions on Architecture and Code Optimization (TACO)},
  journal-url = {http://portal.acm.org/browse_dl.cfm?idx=J924},
}

@article{Eyerman:2014:MTM,
  author = {Stijn Eyerman and Pierre Michaud and Wouter Rogiest},
  title = {Multiprogram Throughput Metrics: a Systematic Approach},
  journal = j-TACO,
  volume = 11,
  number = 3,
  pages = {34:1--34:??},
  month = oct,
  year = 2014,
  coden = {????},
  doi = {http://dx.doi.org/10.1145/2663346},
  issn = {1544-3566 (print), 1544-3973 (electronic)},
  issn-l = {1544-3566},
  bibdate = {Mon Oct 27 17:02:20 MDT 2014},
  bibsource = {http://portal.acm.org/;
    http://www.math.utah.edu/pub/tex/bib/taco.bib},
  abstract = {Running multiple programs on a processor aims at increasing the
    throughput of that processor. However, defining meaningful throughput
    metrics in a simulation environment is not as straightforward as
    reporting execution time. This has led to an ongoing debate on what
    forms a meaningful throughput metric for multiprogram workloads. We
    present a method to construct throughput metrics in a systematic way:
    we start by expressing assumptions on job size, job distribution,
    scheduling, and so forth that together define a theoretical throughput
    experiment. The throughput metric is then the average throughput of
    this experiment. Different assumptions lead to different metrics, so
    one should be aware of these assumptions when making conclusions based
    on results using a specific metric. Throughput metrics should always be
    defined from explicit assumptions, because this leads to a better
    understanding of the implications and limits of the results obtained
    with that metric. We elaborate multiple metrics based on different
    assumptions. In particular, we identify the assumptions that lead to
    the commonly used weighted speedup and harmonic mean of speedups. Our
    study clarifies that they are actual throughput metrics, which was
    recently questioned. We also propose some new throughput metrics, which
    cannot always be expressed as a closed formula. We use real
    experimental data to characterize metrics and show how they relate to
    each other.},
  acknowledgement = ack-nhfb,
  articleno = 34,
  fjournal = {ACM Transactions on Architecture and Code Optimization (TACO)},
  journal-url = {http://portal.acm.org/browse_dl.cfm?idx=J924},
}