%%% -*-BibTeX-*-
%%% ====================================================================
%%% BibTeX-file{
%%%     author          = "Nelson H. F. Beebe",
%%%     version         = "1.19",
%%%     date            = "30 June 2014",
%%%     time            = "18:30:36 MDT",
%%%     filename        = "trets.bib",
%%%     address         = "University of Utah
%%%                        Department of Mathematics, 110 LCB
%%%                        155 S 1400 E RM 233
%%%                        Salt Lake City, UT 84112-0090
%%%                        USA",
%%%     telephone       = "+1 801 581 5254",
%%%     FAX             = "+1 801 581 4148",
%%%     URL             = "http://www.math.utah.edu/~beebe",
%%%     checksum        = "23144 7029 35538 346825",
%%%     email           = "beebe at math.utah.edu, beebe at acm.org,
%%%                        beebe at computer.org (Internet)",
%%%     codetable       = "ISO/ASCII",
%%%     keywords        = "ACM Transactions on Reconfigurable Technology
%%%                        and Systems; bibliography; TRETS",
%%%     license         = "public domain",
%%%     supported       = "yes",
%%%     docstring       = "This is a COMPLETE BibTeX bibliography for
%%%                        ACM Transactions on Reconfigurable Technology
%%%                        and Systems (CODEN ????, ISSN 1936-7406
%%%                        (print), 1936-7414 (electronic)), covering
%%%                        all journal issues from 2008 -- date.
%%%
%%%                        At version 1.19, the COMPLETE journal
%%%                        coverage looked like this:
%%%
%%%                             2008 (  17)    2011 (  29)    2014 (  15)
%%%                             2009 (  33)    2012 (  22)
%%%                             2010 (  37)    2013 (  19)
%%%
%%%                             Article:        172
%%%
%%%                             Total entries:  172
%%%
%%%                        The journal table of contents page is at:
%%%
%%%                            http://www.acm.org/trets/
%%%                            http://portal.acm.org/toc.cfm?id=J1151
%%%
%%%                        Qualified subscribers can retrieve the full
%%%                        text of recent articles in PDF form.
%%%
%%%                        The initial draft was extracted from the ACM
%%%                        Web pages.
%%%
%%%                        ACM copyrights explicitly permit abstracting
%%%                        with credit, so article abstracts, keywords,
%%%                        and subject classifications have been
%%%                        included in this bibliography wherever
%%%                        available.  Article reviews have been
%%%                        omitted, until their copyright status has
%%%                        been clarified.
%%%
%%%                        bibsource keys in the bibliography entries
%%%                        below indicate the entry originally came
%%%                        from the computer science bibliography
%%%                        archive, even though it has likely since
%%%                        been corrected and updated.
%%%
%%%                        URL keys in the bibliography point to
%%%                        World Wide Web locations of additional
%%%                        information about the entry.
%%%
%%%                        BibTeX citation tags are uniformly chosen
%%%                        as name:year:abbrev, where name is the
%%%                        family name of the first author or editor,
%%%                        year is a 4-digit number, and abbrev is a
%%%                        3-letter condensation of important title
%%%                        words. Citation tags were automatically
%%%                        generated by software developed for the
%%%                        BibNet Project.
%%%
%%%                        In this bibliography, entries are sorted in
%%%                        publication order, using ``bibsort -byvolume.''
%%%
%%%                        The checksum field above contains a CRC-16
%%%                        checksum as the first value, followed by the
%%%                        equivalent of the standard UNIX wc (word
%%%                        count) utility output of lines, words, and
%%%                        characters.  This is produced by Robert
%%%                        Solovay's checksum utility."
%%%     }
%%% ====================================================================

%%% TeX preamble emitted ahead of the formatted bibliography.  It loads
%%% bibnames.sty and supplies fallback definitions for the macros
%%% \circled, \reg (used, e.g., in the title and abstract of
%%% Lu:2008:DCR) and \TM.  Each fallback is wrapped in an
%%% \ifx \undefined ... \fi test so that a definition already provided
%%% by the document or its class is never overridden.
@Preamble{"\input bibnames.sty" #
    "\ifx \undefined \circled \def \circled #1{(#1)}\fi" #
    "\ifx \undefined \reg \def \reg {\circled{R}}\fi" #
    "\ifx \undefined \TM \def \TM {${}^{\sc TM}$} \fi"
}

%%% ====================================================================
%%% Acknowledgement abbreviations:

%%% ack-nhfb: contact details (postal address, telephone, FAX, e-mail,
%%% and Web URL) for Nelson H. F. Beebe.  Entries in this file refer to
%%% this abbreviation from their acknowledgement field.  The e-mail
%%% addresses and the URL are wrapped in \path|...| in the value.
@String{ack-nhfb = "Nelson H. F. Beebe,
                    University of Utah,
                    Department of Mathematics, 110 LCB,
                    155 S 1400 E RM 233,
                    Salt Lake City, UT 84112-0090, USA,
                    Tel: +1 801 581 5254,
                    FAX: +1 801 581 4148,
                    e-mail: \path|beebe@math.utah.edu|,
                            \path|beebe@acm.org|,
                            \path|beebe@computer.org| (Internet),
                    URL: \path|http://www.math.utah.edu/~beebe/|"}

%%% ====================================================================
%%% Journal abbreviations:

@String{j-TRETS = {ACM Transactions on Reconfigurable Technology
                   and Systems (TRETS)}}

%%% ====================================================================
%%% Bibliography entries:

@Article{Buell:2008:I,
  author =       {Duncan Buell and Wayne Luk},
  title =        {Introduction},
  journal =      j-TRETS,
  volume =       1,
  number =       1,
  pages =        {1:1--1:??},
  month =        mar,
  year =         2008,
  CODEN =        {????},
  DOI =          {http://doi.acm.org/10.1145/1331897.1331898},
  ISSN =         {1936-7406 (print), 1936-7414 (electronic)},
  ISSN-L =       {1936-7406},
  bibdate =      {Tue Nov 4 17:12:41 MST 2008},
  bibsource =    {http://portal.acm.org/;
                 http://www.math.utah.edu/pub/tex/bib/trets.bib},
  acknowledgement = ack-nhfb,
  articleno =    1,
  fjournal =     {ACM Transactions on Reconfigurable Technology and
                 Systems (TRETS)},
  journal-URL =  {http://portal.acm.org/toc.cfm?id=J1151},
}

@Article{DeHon:2008:GET,
  author =       {Andr{\'e} DeHon and Mike Hutton},
  title =        {Guest Editorial: {TRETS} Special Edition on the {15th
                 International Symposium on FPGAs}},
  journal =      j-TRETS,
  volume =       1,
  number =       1,
  pages =        {2:1--2:??},
  month =        mar,
  year =         2008,
  CODEN =        {????},
  DOI =          {http://doi.acm.org/10.1145/1331897.1341292},
  ISSN =         {1936-7406 (print), 1936-7414 (electronic)},
  ISSN-L =       {1936-7406},
  bibdate =      {Tue Nov 4 17:12:41 MST 2008},
  bibsource =    {http://portal.acm.org/;
                 http://www.math.utah.edu/pub/tex/bib/trets.bib},
  acknowledgement = ack-nhfb,
  articleno =    2,
  fjournal =     {ACM Transactions on Reconfigurable Technology and
                 Systems (TRETS)},
  journal-URL =  {http://portal.acm.org/toc.cfm?id=J1151},
}

@Article{Matsumoto:2008:SID,
  author =       "Yohei Matsumoto and Masakazu Hioki and Takashi
                 Kawanami and Hanpei Koike and Toshiyuki Tsutsumi and
                 Tadashi Nakagawa and Toshihiro Sekigawa",
  title =        "Suppression of Intrinsic Delay Variation in {FPGAs}
                 using Multiple Configurations",
  journal =      j-TRETS,
  volume =       "1",
  number =       "1",
  pages =        "3:1--3:??",
  month =        mar,
  year =         "2008",
  CODEN =        "????",
  DOI =          "http://doi.acm.org/10.1145/1331897.1331899",
  ISSN =         "1936-7406 (print), 1936-7414 (electronic)",
  ISSN-L =       "1936-7406",
  bibdate =      "Tue Nov 4 17:12:41 MST 2008",
  bibsource =    "http://portal.acm.org/;
                 http://www.math.utah.edu/pub/tex/bib/trets.bib",
  abstract =     "A new method for improving the timing yield of
                 field-programmable gate array (FPGA) devices affected
                 by intrinsic within-die variation is proposed. The
                 timing variation is reduced by selecting an appropriate
                 configuration for each chip from a set of independent
                 configurations, the critical paths of which do not
                 share the same circuit resources on the FPGA. In this
                 article, the actual method used to generate independent
                 multiple configurations by simply repeating the routing
                 phase is shown, along with the results of Monte Carlo
                 simulation with 10,000 samples. One simulation result
                 showed that the standard deviations of maximum critical
                 path delays are reduced by 28\% and 49\% for 10\% and
                 30\% V$_{th}$ variations ($ \sigma / \mu $),
                 respectively, with 10 independent configurations.
                 Therefore, the proposed method is especially effective
                 for larger V$_{th}$ variation and is expected to be
                 useful for suppressing the performance variation of
                 FPGAs due to the future increase of parameter
                 variation. Another simulation result showed that the
                 effectiveness of the proposed technique was saturated
                 at the use of 10 or more configurations because of the
                 degradation of the quality of the configurations.
                 Therefore, the use of 10 or fewer configurations is
                 reasonable.",
  acknowledgement = ack-nhfb,
  articleno =    "3",
  fjournal =     "ACM Transactions on Reconfigurable Technology and
                 Systems (TRETS)",
  journal-URL =  "http://portal.acm.org/toc.cfm?id=J1151",
  keywords =     "configuration; FPGA; timing yield; within-die
                 variation",
}

@Article{Sivaswamy:2008:SAP,
  author =       {Satish Sivaswamy and Kia Bazargan},
  title =        {Statistical Analysis and Process Variation-Aware
                 Routing and Skew Assignment for {FPGAs}},
  journal =      j-TRETS,
  volume =       1,
  number =       1,
  pages =        {4:1--4:??},
  month =        mar,
  year =         2008,
  CODEN =        {????},
  DOI =          {http://doi.acm.org/10.1145/1331897.1331900},
  ISSN =         {1936-7406 (print), 1936-7414 (electronic)},
  ISSN-L =       {1936-7406},
  bibdate =      {Tue Nov 4 17:12:41 MST 2008},
  bibsource =    {http://portal.acm.org/;
                 http://www.math.utah.edu/pub/tex/bib/trets.bib},
  abstract =     {With constant scaling of process technologies, chip
                 design is becoming increasingly difficult due to
                 process variations. The FPGA community has only
                 recently started focusing on the effects of variations.
                 In this work we present a statistical analysis to
                 compare the effects of variations on designs mapped to
                 FPGAs and ASICs. We also present CAD and architecture
                 techniques to mitigate the impact of variations. First
                 we present a variation-aware router that optimizes
                 statistical criticality. We then propose a modification
                 to the clock network to deliver programmable skews to
                 different flip-flops. Finally, we combine the two
                 techniques and the result is a 9x reduction in yield
                 loss that translates to a 12\% improvement in timing
                 yield. When the desired timing yield is set to 99\%,
                 our combined statistical routing and skew assignment
                 technique results in a delay improvement of about 10\%
                 over a purely deterministic approach.},
  acknowledgement = ack-nhfb,
  articleno =    4,
  fjournal =     {ACM Transactions on Reconfigurable Technology and
                 Systems (TRETS)},
  journal-URL =  {http://portal.acm.org/toc.cfm?id=J1151},
  keywords =     {routing; skew assignment; statistical timing
                 analysis},
}

@Article{Lu:2008:DCR,
  author =       {Shih-Lien L. Lu and Peter Yiannacouras and Taeweon Suh
                 and Rolf Kassa and Michael Konow},
  title =        {A Desktop Computer with a Reconfigurable
                 {Pentium\reg}},
  journal =      j-TRETS,
  volume =       1,
  number =       1,
  pages =        {5:1--5:??},
  month =        mar,
  year =         2008,
  CODEN =        {????},
  DOI =          {http://doi.acm.org/10.1145/1331897.1331901},
  ISSN =         {1936-7406 (print), 1936-7414 (electronic)},
  ISSN-L =       {1936-7406},
  bibdate =      {Tue Nov 4 17:12:41 MST 2008},
  bibsource =    {http://portal.acm.org/;
                 http://www.math.utah.edu/pub/tex/bib/trets.bib},
  abstract =     {Advancements in reconfigurable technologies,
                 specifically FPGAs, have yielded faster, more
                 power-efficient reconfigurable devices with enormous
                 capacities. In our work, we provide testament to the
                 impressive capacity of recent FPGAs by hosting a
                 complete Pentium$^{{\reg }}$ in a single FPGA chip. In
                 addition we demonstrate how FPGAs can be used for
                 microprocessor design space exploration while
                 overcoming the tension between simulation speed, model
                 accuracy, and model completeness found in traditional
                 software simulator environments. Specifically, we
                 perform preliminary experimentation/prototyping with an
                 original Socket 7 based desktop processor system with
                 typical hardware peripherals running modern operating
                 systems such as Fedora Core 4 and Windows XP; however
                 we have inserted a Xilinx Virtex-4 in place of the
                 processor that should sit in the motherboard and have
                 used the Virtex-4 to host a complete version of the
                 Pentium$^{{\reg }}$ microprocessor (which consumes less
                 than half its resources). We can therefore apply
                 architectural changes to the processor and evaluate
                 their effects on the complete desktop system. We use
                 this FPGA-based emulation system to conduct preliminary
                 architectural experiments including growing the branch
                 target buffer and the level 1 caches. In addition, we
                 experimented with interfacing hardware accelerators
                 such as DES and AES engines which resulted in a 27x
                 speedup.},
  acknowledgement = ack-nhfb,
  articleno =    5,
  fjournal =     {ACM Transactions on Reconfigurable Technology and
                 Systems (TRETS)},
  journal-URL =  {http://portal.acm.org/toc.cfm?id=J1151},
  keywords =     {accelerator; architecture; emulator; exploration;
                 FPGA; model; operating system; Pentium processor;
                 reconfigurable; simulator},
}

@Article{Feng:2008:DEI,
  author =       {Wenyi Feng and Sinan Kaptanoglu},
  title =        {Designing Efficient Input Interconnect Blocks for
                 {LUT} Clusters Using Counting and Entropy},
  journal =      j-TRETS,
  volume =       1,
  number =       1,
  pages =        {6:1--6:??},
  month =        mar,
  year =         2008,
  CODEN =        {????},
  DOI =          {http://doi.acm.org/10.1145/1331897.1331902},
  ISSN =         {1936-7406 (print), 1936-7414 (electronic)},
  ISSN-L =       {1936-7406},
  bibdate =      {Tue Nov 4 17:12:41 MST 2008},
  bibsource =    {http://portal.acm.org/;
                 http://www.math.utah.edu/pub/tex/bib/trets.bib},
  abstract =     {In a cluster-based FPGA, the interconnect from
                 external routing tracks and cluster feedbacks to the
                 LUT inputs consumes significant area, and no consensus
                 has emerged among different implementations (e.g.,
                 1-level or 2-level). In this paper, we model this
                 interconnect as a unified input interconnect block
                 (IIB). We identify three types of IIBs and develop
                 general combinatorial techniques to count the number of
                 distinct functional configurations for them. We use
                 entropy, defined as the logarithm of this count, to
                 estimate an IIB's routing flexibility. This enables us
                 to analytically evaluate different IIBs without the
                 customary time-consuming place and route experiments.
                 We show that both depopulated 1-level IIBs and
                 VPR-style 2-level IIBs achieve high routing flexibility
                 but lack area efficiency. We propose a novel class of
                 highly efficient, yet still simple, IIBs that use
                 substantially fewer switches with only a small
                 degradation in routing flexibility. Experimental
                 results verify the routability of these IIBs, and
                 confirm that entropy is a good predictor of
                 routability.},
  acknowledgement = ack-nhfb,
  articleno =    6,
  fjournal =     {ACM Transactions on Reconfigurable Technology and
                 Systems (TRETS)},
  journal-URL =  {http://portal.acm.org/toc.cfm?id=J1151},
  keywords =     {cluster; counting; entropy; FPGAs; interconnect; LUT;
                 PLDs},
}

@Article{Wilton:2008:SDO,
  author =       {Steven J. E. Wilton and Chun Hok Ho and Bradley
                 Quinton and Philip H. W. Leong and Wayne Luk},
  title =        {A Synthesizable Datapath-Oriented Embedded {FPGA}
                 Fabric for Silicon Debug Applications},
  journal =      j-TRETS,
  volume =       1,
  number =       1,
  pages =        {7:1--7:??},
  month =        mar,
  year =         2008,
  CODEN =        {????},
  DOI =          {http://doi.acm.org/10.1145/1331897.1331903},
  ISSN =         {1936-7406 (print), 1936-7414 (electronic)},
  ISSN-L =       {1936-7406},
  bibdate =      {Tue Nov 4 17:12:41 MST 2008},
  bibsource =    {http://portal.acm.org/;
                 http://www.math.utah.edu/pub/tex/bib/trets.bib},
  abstract =     {We present an architecture for a synthesizable
                 datapath-oriented FPGA core that can be used to provide
                 post-fabrication flexibility to an SoC. Our
                 architecture is optimized for bus-based operations and
                 employs a directional routing architecture, which
                 allows it to be synthesized using standard ASIC design
                 tools and flows. The primary motivation for this
                 architecture is to provide an efficient mechanism to
                 support on-chip debugging. The fabric can also be used
                 to implement other datapath-oriented circuits such as
                 those needed in signal processing and
                 computation-intensive applications. We evaluate our
                 architecture using a set of benchmark circuits and
                 compare it to previous fabrics in terms of area, speed,
                 and power.},
  acknowledgement = ack-nhfb,
  articleno =    7,
  fjournal =     {ACM Transactions on Reconfigurable Technology and
                 Systems (TRETS)},
  journal-URL =  {http://portal.acm.org/toc.cfm?id=J1151},
  keywords =     {Field programmable gate array; integrated circuit;
                 silicon debug; system-on-chip},
}

@Article{Guneysu:2008:SPH,
  author =       {Tim G{\"u}neysu and Christof Paar and Jan Pelzl},
  title =        {Special-Purpose Hardware for Solving the Elliptic
                 Curve Discrete Logarithm Problem},
  journal =      j-TRETS,
  volume =       1,
  number =       2,
  pages =        {8:1--8:??},
  month =        jun,
  year =         2008,
  CODEN =        {????},
  DOI =          {http://doi.acm.org/10.1145/1371579.1371580},
  ISSN =         {1936-7406 (print), 1936-7414 (electronic)},
  ISSN-L =       {1936-7406},
  bibdate =      {Tue Nov 4 17:12:42 MST 2008},
  bibsource =    {http://portal.acm.org/;
                 http://www.math.utah.edu/pub/tex/bib/trets.bib},
  abstract =     {The resistance against powerful index-calculus attacks
                 makes Elliptic Curve Cryptosystems (ECC) an interesting
                 alternative to conventional asymmetric cryptosystems,
                 like RSA. Operands in ECC require significantly less
                 bits at the same level of security, resulting in a
                 higher computational efficiency compared to RSA. With
                 growing computational capabilities and continuous
                 technological improvements over the years, however, the
                 question of the security of ECC against attacks based
                 on special-purpose hardware arises. In this context,
                 recently emerged low-cost FPGAs demand for attention in
                 the domain of hardware-based cryptanalysis: the
                 extraordinary efficiency of modern programmable
                 hardware devices allow for a low-budget implementation
                 of hardware-based ECC attacks---without the requirement
                 of the expensive development of ASICs.\par

                 With focus on the aspect of cost-efficiency, this
                 contribution presents and analyzes an FPGA-based
                 architecture of an attack against ECC over prime
                 fields. A multi-processing hardware architecture for
                 Pollard's Rho method is described. We provide results
                 on actually used key lengths of ECC (128 bits and
                 above) and estimate the expected runtime for a
                 successful attack.\par

                 As a first result, currently used elliptic curve
                 cryptosystems with a security of 160 bit and above turn
                 out to be infeasible to break with available
                 computational and financial resources. However, some of
                 the security standards proposed by the Standards for
                 Efficient Cryptography Group (SECG) become subject to
                 attacks based on low-cost FPGAs.},
  acknowledgement = ack-nhfb,
  articleno =    8,
  fjournal =     {ACM Transactions on Reconfigurable Technology and
                 Systems (TRETS)},
  journal-URL =  {http://portal.acm.org/toc.cfm?id=J1151},
  keywords =     {cryptanalysis; discrete logarithm; elliptic curve
                 cryptosystem; Pollard's rho},
}

@Article{Jacob:2008:MBA,
  author =       {Arpith Jacob and Joseph Lancaster and Jeremy Buhler
                 and Brandon Harris and Roger D. Chamberlain},
  title =        {{Mercury BLASTP}: Accelerating Protein Sequence
                 Alignment},
  journal =      j-TRETS,
  volume =       1,
  number =       2,
  pages =        {9:1--9:??},
  month =        jun,
  year =         2008,
  CODEN =        {????},
  DOI =          {http://doi.acm.org/10.1145/1371579.1371581},
  ISSN =         {1936-7406 (print), 1936-7414 (electronic)},
  ISSN-L =       {1936-7406},
  bibdate =      {Tue Nov 4 17:12:42 MST 2008},
  bibsource =    {http://portal.acm.org/;
                 http://www.math.utah.edu/pub/tex/bib/trets.bib},
  abstract =     {Large-scale protein sequence comparison is an
                 important but compute-intensive task in molecular
                 biology. BLASTP is the most popular tool for
                 comparative analysis of protein sequences. In recent
                 years, an exponential increase in the size of protein
                 sequence databases has required either exponentially
                 more running time or a cluster of machines to keep
                 pace. To address this problem, we have designed and
                 built a high-performance FPGA-accelerated version of
                 BLASTP, {\em Mercury BLASTP}. In this article, we
                 describe the architecture of the portions of the
                 application that are accelerated in the FPGA, and we
                 also describe the integration of these FPGA-accelerated
                 portions with the existing BLASTP software. We have
                 implemented Mercury BLASTP on a commodity workstation
                 with two Xilinx Virtex-II 6000 FPGAs. We show that the
                 new design runs 11--15 times faster than software
                 BLASTP on a modern CPU while delivering close to 99\%
                 identical results.},
  acknowledgement = ack-nhfb,
  articleno =    9,
  fjournal =     {ACM Transactions on Reconfigurable Technology and
                 Systems (TRETS)},
  journal-URL =  {http://portal.acm.org/toc.cfm?id=J1151},
  keywords =     {bioinformatics; biological sequence alignment},
}

@Article{Sedcole:2008:PYM,
  author =       {Pete Sedcole and Peter Y. K. Cheung},
  title =        {Parametric Yield Modeling and Simulations of {FPGA}
                 Circuits Considering Within-Die Delay Variations},
  journal =      j-TRETS,
  volume =       1,
  number =       2,
  pages =        {10:1--10:??},
  month =        jun,
  year =         2008,
  CODEN =        {????},
  DOI =          {http://doi.acm.org/10.1145/1371579.1371582},
  ISSN =         {1936-7406 (print), 1936-7414 (electronic)},
  ISSN-L =       {1936-7406},
  bibdate =      {Tue Nov 4 17:12:42 MST 2008},
  bibsource =    {http://portal.acm.org/;
                 http://www.math.utah.edu/pub/tex/bib/trets.bib},
  abstract =     {Variations in the semiconductor fabrication process
                 results in differences in parameters between
                 transistors on the same die, a problem exacerbated by
                 lithographic scaling. Field-Programmable Gate Arrays
                 may be able to compensate for within-die delay
                 variability, by judicious use of reconfigurability.
                 This article presents two strategies for compensating
                 within-die stochastic delay variability by using
                 reconfiguration: reconfiguring the entire FPGA, and
                 relocating subcircuits within an FPGA. Analytical
                 models for the theoretical bounds on the achievable
                 gains are derived for both strategies and compared to
                 models for worst-case design as well as statistical
                 static timing analysis (SSTA). All models are validated
                 by comparison to circuit-level Monte Carlo simulations.
                 It is demonstrated that significant improvements in
                 circuit yield and timing are possible using SSTA alone,
                 and these improvements can be enhanced by employing
                 reconfiguration-based techniques.},
  acknowledgement = ack-nhfb,
  articleno =    10,
  fjournal =     {ACM Transactions on Reconfigurable Technology and
                 Systems (TRETS)},
  journal-URL =  {http://portal.acm.org/toc.cfm?id=J1151},
  keywords =     {delay; FPGA; modeling; process variation;
                 reconfiguration; statistical theory; within-die
                 variability; yield},
}

@Article{Gorjiara:2008:MDC,
  author =       {Bita Gorjiara and Mehrdad Reshadi and Daniel Gajski},
  title =        {Merged Dictionary Code Compression for {FPGA}
                 Implementation of Custom Microcoded {PEs}},
  journal =      j-TRETS,
  volume =       1,
  number =       2,
  pages =        {11:1--11:??},
  month =        jun,
  year =         2008,
  CODEN =        {????},
  DOI =          {http://doi.acm.org/10.1145/1371579.1371583},
  ISSN =         {1936-7406 (print), 1936-7414 (electronic)},
  ISSN-L =       {1936-7406},
  bibdate =      {Tue Nov 4 17:12:42 MST 2008},
  bibsource =    {http://portal.acm.org/;
                 http://www.math.utah.edu/pub/tex/bib/trets.bib},
  abstract =     {Horizontal Microcoded Architecture (HMA) is a paradigm
                 for designing programmable high-performance processing
                 elements (PEs). However, it suffers from large code
                 size, which can be addressed by compression. In this
                 article, we study the code size of one of the new
                 HMA-based technologies called No-Instruction-Set
                 Computer (NISC). We show that NISC code size can be
                 several times larger than a typical RISC processor, and
                 we propose several low-overhead dictionary-based code
                 compression techniques to reduce its code size. Our
                 compression algorithm leverages the knowledge of
                 ``don't care'' values in the control words and can
                 reduce the code size by 3.3 times, on average. Despite
                 such good results, as shown in this article, these
                 compression techniques lead to poor FPGA
                 implementations because they require many on-chip RAMs.
                 To address this issue, we introduce an FPGA-aware
                 dictionary-based technique that uses the dual-port
                 feature of on-chip RAMs to reduce the number of
                 utilized block RAMs by half. Additionally, we propose
                 cascading two-levels of dictionaries for code size and
                 block RAM reduction of large programs. For an MP3
                 application, a merged, cascaded, three-dictionary
                 implementation reduces the number of utilized block
                 RAMs by 4.3 times (76\%) compared to a NISC without
                 compression. This corresponds to 20\% additional
                 savings over the best single level dictionary-based
                 compression.},
  acknowledgement = ack-nhfb,
  articleno =    11,
  fjournal =     {ACM Transactions on Reconfigurable Technology and
                 Systems (TRETS)},
  journal-URL =  {http://portal.acm.org/toc.cfm?id=J1151},
  keywords =     {dictionary based compression; FPGA; memory
                 optimization; microcoded architectures;
                 no-instruction-set computer},
}

@Article{Thomas:2008:MGR,
  author =       "David B. Thomas and Wayne Luk",
  title =        "Multivariate {Gaussian} Random Number Generation
                 Targeting Reconfigurable Hardware",
  journal =      j-TRETS,
  volume =       "1",
  number =       "2",
  pages =        "12:1--12:??",
  month =        jun,
  year =         "2008",
  CODEN =        "????",
  DOI =          "http://doi.acm.org/10.1145/1371579.1371584",
  ISSN =         "1936-7406 (print), 1936-7414 (electronic)",
  ISSN-L =       "1936-7406",
  bibdate =      "Tue Nov 4 17:12:42 MST 2008",
  bibsource =    "http://portal.acm.org/;
                 http://www.math.utah.edu/pub/tex/bib/prng.bib;
                 http://www.math.utah.edu/pub/tex/bib/trets.bib",
  abstract =     "The multivariate Gaussian distribution is often used
                 to model correlations between stochastic time-series,
                 and can be used to explore the effect of these
                 correlations across $N$ time-series in Monte-Carlo
                 simulations. However, generating random correlated
                 vectors is an $ O(N^2) $ process, and quickly becomes a
                 computational bottleneck in software simulations. This
                 article presents an efficient method for generating
                 vectors in parallel hardware, using $N$ parallel
                 pipelined components to generate a new vector every $N$
                 cycles. This method maps well to the embedded block
                 RAMs and multipliers in contemporary FPGAs,
                 particularly as extensive testing shows that the
                 limited bit-width arithmetic does not reduce the
                 statistical quality of the generated vectors. An
                 implementation of the architecture in the Virtex-4
                 architecture achieves a 500MHz clock-rate, and can
                 support vector lengths up to 512 in the largest
                 devices. The combination of a high clock-rate and
                 parallelism provides a significant performance
                 advantage over conventional processors, with an
                 xc4vsx55 device at 500MHz providing a 200 times speedup
                 over an Opteron 2.6GHz using an AMD optimised BLAS
                 package. In a case study in Delta-Gamma Value-at-Risk,
                 an RC2000 accelerator card using an xc4vsx55 at 400MHz
                 is 26 times faster than a quad Opteron 2.6GHz SMP.",
  acknowledgement = ack-nhfb,
  articleno =    "12",
  fjournal =     "ACM Transactions on Reconfigurable Technology and
                 Systems (TRETS)",
  journal-URL =  "http://portal.acm.org/toc.cfm?id=J1151",
  keywords =     "FPGA; multivariate Gaussian distribution; random
                 numbers",
}

@Article{Lamoureux:2008:TBP,
  author =       "Julien Lamoureux and Steven J. E. Wilton",
  title =        "On the Trade-Off Between Power and Flexibility of
                 {FPGA} Clock Networks",
  journal =      j-TRETS,
  volume =       "1",
  number =       "3",
  pages =        "13:1--13:??",
  month =        sep,
  year =         "2008",
  CODEN =        "????",
  DOI =          "http://doi.acm.org/10.1145/1391732.1391733",
  ISSN =         "1936-7406 (print), 1936-7414 (electronic)",
  ISSN-L =       "1936-7406",
  bibdate =      "Tue Nov 4 17:12:44 MST 2008",
  bibsource =    "http://portal.acm.org/;
                 http://www.math.utah.edu/pub/tex/bib/trets.bib",
  abstract =     "FPGA clock networks consume a significant amount of
                 power, since they toggle every clock cycle and must be
                 flexible enough to implement the clocks for a wide
                 range of different applications. The efficiency of FPGA
                 clock networks can be improved by reducing this
                 flexibility; however, reducing the flexibility
                 introduces stricter constraints during the clustering
                 and placement stages of the FPGA CAD flow. These
                 constraints can reduce the overall efficiency of the
                 final implementation. This article examines the
                 trade-off between the power consumption and flexibility
                 of FPGA clock networks.\par

                 Specifically, this article makes three contributions.
                 First, it presents a new parameterized clock-network
                 framework for describing and comparing FPGA clock
                 networks. Second, it describes new clock-aware
                 placement techniques that are needed to find a legal
                 placement satisfying the constraints imposed by the
                 clock network. Finally, it performs an empirical study
                 to examine the trade-off between the power consumption
                 of the clock network and the impact of the CAD
                 constraints for a number of different clock networks
                 with varying amounts of flexibility.\par

                 The results show that the techniques used to produce a
                 legal placement can have a significant influence on
                 power and the ability of the placer to find a legal
                 solution. On average, circuits placed using the most
                 effective techniques dissipate 5\% less overall energy
                 and are significantly more likely to be legal than
                 circuits placed using other techniques. Moreover, the
                 results show that the architecture of the clock network
                 is also important. On average, FPGAs with an efficient
                 clock network are up to 14.6\% more energy efficient
                 compared to other FPGAs.",
  acknowledgement = ack-nhfb,
  articleno =    "13",
  fjournal =     "ACM Transactions on Reconfigurable Technology and
                 Systems (TRETS)",
  journal-URL =  "http://portal.acm.org/toc.cfm?id=J1151",
  keywords =     "clock distribution networks; clock-aware placement;
                 FPGA; low-power design",
}

@Article{Slogsnat:2008:OSH,
  author =       "David Slogsnat and Alexander Giese and Mondrian
                 N{\"u}ssle and Ulrich Br{\"u}ning",
  title =        "An Open-Source {HyperTransport} Core",
  journal =      j-TRETS,
  volume =       "1",
  number =       "3",
  pages =        "14:1--14:??",
  month =        sep,
  year =         "2008",
  CODEN =        "????",
  DOI =          "http://doi.acm.org/10.1145/1391732.1391734",
  ISSN =         "1936-7406 (print), 1936-7414 (electronic)",
  ISSN-L =       "1936-7406",
  bibdate =      "Tue Nov 4 17:12:44 MST 2008",
  bibsource =    "http://portal.acm.org/;
                 http://www.math.utah.edu/pub/tex/bib/trets.bib",
  abstract =     "This article presents the design of a generic
                 HyperTransport (HT) core. HyperTransport is a
                 packet-based interconnect technology for low-latency,
                 high-bandwidth point-to-point connections. It is
                 specially optimized to achieve a very low latency. The
                 core has been verified in system using an FPGA. This
                 exhaustive verification and the generic design allow
                 the mapping to both ASICs and FPGAs. The implementation
                 described in this work supports a 16-bit link width, as
                 used by Opteron processors. On a Xilinx Virtex-4 FX60,
                 the core supports a link frequency of 400 MHz DDR and
                 offers a maximum bidirectional bandwidth of 3.2GB/s.
                 The in-system verification has been performed using a
                 custom FPGA board that has been plugged into a
                 HyperTransport extension connector (HTX) of a standard
                 Opteron-based motherboard. HTX slots in Opteron-based
                 motherboards allow very high-bandwidth, low-latency
                 communication, since the HTX device is directly
                 connected to one of the HyperTransport links of the
                 processor. Performance analysis shows a unidirectional
                 payload bandwidth of 1.4GB/s and a read latency of 180
                 ns. The HT core in combination with the HTX board is an
                 ideal base for prototyping systems and implementing
                 FPGA coprocessors. The HT core is available as open
                 source.",
  acknowledgement = ack-nhfb,
  articleno =    "14",
  fjournal =     "ACM Transactions on Reconfigurable Technology and
                 Systems (TRETS)",
  journal-URL =  "http://portal.acm.org/toc.cfm?id=J1151",
  keywords =     "FPGA; HTX; HyperTransport; prototyping; RTL",
}

@Article{Beeckler:2008:PGR,
  author =       "John S. Beeckler and Warren J. Gross",
  title =        "Particle Graphics on Reconfigurable Hardware",
  journal =      j-TRETS,
  volume =       "1",
  number =       "3",
  pages =        "15:1--15:??",
  month =        sep,
  year =         "2008",
  CODEN =        "????",
  DOI =          "http://doi.acm.org/10.1145/1391732.1391735",
  ISSN =         "1936-7406 (print), 1936-7414 (electronic)",
  ISSN-L =       "1936-7406",
  bibdate =      "Tue Nov 4 17:12:44 MST 2008",
  bibsource =    "http://portal.acm.org/;
                 http://www.math.utah.edu/pub/tex/bib/trets.bib",
  abstract =     "Particle graphics simulations are well suited for
                 modeling complex phenomena such as water, cloth,
                 explosions, fire, smoke, and clouds. They are normally
                 realized in software as part of an interactive graphics
                 application. The computational complexity of particle
                 graphics simulations restricts the number of particles
                 that can be updated in software at interactive frame
                 rates. This article presents the design and
                 implementation of a hardware particle graphics engine
                 for accelerating real-time particle graphics
                 simulations. We explore the design process,
                 implementation issues, and limitations of using
                 field-programmable gate arrays (FPGAs) for the
                 acceleration of particle graphics. The FPGA particle
                 engine processes million-particle systems at a rate
                 from 47 to 112 million particles per second, which
                 represents one to two orders of magnitude speedup over
                 a 2.8 GHz CPU. Using three FPGAs, a maximum sustained
                 performance of 112 million particles per second was
                 achieved.",
  acknowledgement = ack-nhfb,
  articleno =    "15",
  fjournal =     "ACM Transactions on Reconfigurable Technology and
                 Systems (TRETS)",
  journal-URL =  "http://portal.acm.org/toc.cfm?id=J1151",
  keywords =     "FPGAs; particle systems; reconfigurable computing;
                 special-purpose architectures",
}

@Article{Grant:2008:PMS,
  author =       "David Grant and Guy Lemieux",
  title =        "Perturb $+$ Mutate: Semisynthetic Circuit Generation
                 for Incremental Placement and Routing",
  journal =      j-TRETS,
  volume =       "1",
  number =       "3",
  pages =        "16:1--16:??",
  month =        sep,
  year =         "2008",
  CODEN =        "????",
  DOI =          "http://doi.acm.org/10.1145/1391732.1391736",
  ISSN =         "1936-7406 (print), 1936-7414 (electronic)",
  ISSN-L =       "1936-7406",
  bibdate =      "Tue Nov 4 17:12:44 MST 2008",
  bibsource =    "http://portal.acm.org/;
                 http://www.math.utah.edu/pub/tex/bib/trets.bib",
  abstract =     "CAD tool designers are always searching for more
                 benchmark circuits to stress their software. In this
                 article we present a heuristic method to generate
                 benchmark circuits specially suited for incremental
                 place-and-route tools. The method removes part of a
                 real circuit and replaces it with an altered version of
                 the same circuit to mimic an incremental design change.
                 The alteration consists of two steps: {\em mutate\/}
                 followed by {\em perturb}. The perturb step exactly
                 preserves as many circuit characteristics as possible.
                 While perturbing, reproduction of interconnect
                 locality, a characteristic that is difficult to measure
                 reliably or reproduce exactly, is controlled using a
                 new technique, {\em ancestor depth control\/} (ADC).
                 Perturbing with ADC produces circuits with postrouting
                 properties that match the best techniques known
                 to-date. The mutate step produces targeted mutations
                 resulting in controlled changes to specific circuit
                 properties (while keeping other properties constant).
                 We demonstrate one targeted mutation heuristic, scale,
                 to significantly change circuit size with little change
                 to other circuit characteristics. The method is simple
                 enough for inclusion in a CAD tool directly, and fast
                 enough for use in on-the-fly benchmark generation.",
  acknowledgement = ack-nhfb,
  articleno =    "16",
  fjournal =     "ACM Transactions on Reconfigurable Technology and
                 Systems (TRETS)",
  journal-URL =  "http://portal.acm.org/toc.cfm?id=J1151",
  keywords =     "automated development tools; design automation; graph
                 algorithms; hardware-supporting software; place and
                 route; testing",
}

@Article{Hsiung:2008:PSB,
  author =       "Pao-Ann Hsiung and Chao-Sheng Lin and Chih-Feng Liao",
  title =        "{Perfecto}: a {SystemC}-Based Design-Space Exploration
                 Framework for Dynamically Reconfigurable
                 Architectures",
  journal =      j-TRETS,
  volume =       "1",
  number =       "3",
  pages =        "17:1--17:??",
  month =        sep,
  year =         "2008",
  CODEN =        "????",
  DOI =          "http://doi.acm.org/10.1145/1391732.1391737",
  ISSN =         "1936-7406 (print), 1936-7414 (electronic)",
  ISSN-L =       "1936-7406",
  bibdate =      "Tue Nov 4 17:12:44 MST 2008",
  bibsource =    "http://portal.acm.org/;
                 http://www.math.utah.edu/pub/tex/bib/trets.bib",
  abstract =     "To cope with increasing demands for higher
                 computational power and greater system flexibility,
                 dynamically and partially reconfigurable logic has
                 started to play an important role in embedded systems
                 and systems-on-chip (SoC). However, when using
                 traditional design methods and tools, it is difficult
                 to estimate or analyze the performance impact of
                 including such reconfigurable logic devices into a
                 system design. In this work, we present a system-level
                 framework, called Perfecto, which is able to perform
                 rapid exploration of different reconfigurable design
                 alternatives and to detect system performance
                 bottlenecks. This framework is based on the popular
                 IEEE standard system-level design language SystemC,
                 which is supported by most EDA and ESL tools. Given an
                 architecture model and an application model, Perfecto
                 uses SystemC {\em transaction-level models\/} (TLMs) to
                 simulate the system design alternatives automatically.
                 Different hardware-software copartitioning,
                 coscheduling, and placement algorithms can be embedded
                 into the framework for analysis; thus, Perfecto can
                 also be used to design the algorithms to be used in an
                 operating system for reconfigurable systems.
                 Applications to a simple illustration example and a
                 network security system have shown how Perfecto helps a
                 designer make intelligent partition decisions, optimize
                 system performance, and evaluate task placements.",
  acknowledgement = ack-nhfb,
  articleno =    "17",
  fjournal =     "ACM Transactions on Reconfigurable Technology and
                 Systems (TRETS)",
  journal-URL =  "http://portal.acm.org/toc.cfm?id=J1151",
  keywords =     "design-space exploration; partitioning; performance
                 evaluation; placement; reconfigurable systems;
                 scheduling",
}

@article{Chin:2009:SDM,
  author          = {Scott Y. L. Chin and Steven J. E. Wilton},
  title           = {Static and Dynamic Memory Footprint Reduction for {FPGA}
    Routing Algorithms},
  journal         = j-TRETS,
  volume          = {1},
  number          = {4},
  pages           = {18:1--18:??},
  month           = jan,
  year            = {2009},
  coden           = {????},
  doi             = {http://doi.acm.org/10.1145/1462586.1462587},
  issn            = {1936-7406 (print), 1936-7414 (electronic)},
  issn-l          = {1936-7406},
  bibdate         = {Mon Jun 1 18:15:01 MDT 2009},
  bibsource       = {http://portal.acm.org/;
    http://www.math.utah.edu/pub/tex/bib/trets.bib},
  abstract        = {This article presents techniques to reduce the static and
    dynamic memory requirements of routing algorithms that target
    field-programmable gate arrays. During routing, memory is required to
    store both architectural data and temporary routing data. The
    architectural data is static, and provides a representation of the
    physical routing resources and programmable connections on the device.
    We show that by taking advantage of the regularity in FPGAs, we can
    reduce the amount of information that must be explicitly represented,
    leading to significant memory savings. The temporary routing data is
    dynamic, and contains scoring parameters and traceback information for
    each routing resource in the FPGA. By studying the lifespan of the
    temporary routing data objects, we develop several memory management
    schemes to reduce this component. To make our proposals concrete, we
    applied them to the routing algorithm in VPR and empirically quantified
    the impact on runtime memory footprint, and place and route time.},
  acknowledgement = ack-nhfb,
  articleno       = {18},
  fjournal        = {ACM Transactions on Reconfigurable Technology and Systems
    (TRETS)},
  journal-url     = {http://portal.acm.org/toc.cfm?id=J1151},
  keywords        = {CAD; FPGA; memory; routing; scalability},
}

@article{Xu:2009:FAR,
  author          = {Ning-Yi Xu and Xiong-Fei Cai and Rui Gao and Lei Zhang and
    Feng-Hsiung Hsu},
  title           = {{FPGA} Acceleration of {RankBoost} in {Web} Search
    Engines},
  journal         = j-TRETS,
  volume          = {1},
  number          = {4},
  pages           = {19:1--19:??},
  month           = jan,
  year            = {2009},
  coden           = {????},
  doi             = {http://doi.acm.org/10.1145/1462586.1462588},
  issn            = {1936-7406 (print), 1936-7414 (electronic)},
  issn-l          = {1936-7406},
  bibdate         = {Mon Jun 1 18:15:01 MDT 2009},
  bibsource       = {http://portal.acm.org/;
    http://www.math.utah.edu/pub/tex/bib/trets.bib},
  abstract        = {Search relevance is a key measurement for the usefulness
    of search engines. Shift of search relevance among search engines can
    easily change a search company's market cap by tens of billions of
    dollars. With the ever-increasing scale of the Web, machine learning
    technologies have become important tools to improve search relevance
    ranking. RankBoost is a promising algorithm in this area, but it is not
    widely used due to its long training time. To reduce the computation
    time for RankBoost, we designed a FPGA-based accelerator system and its
    upgraded version. The accelerator, plugged into a commodity PC,
    increased the training speed on MSN search engine data up to 1800x
    compared to the original software implementation on a server. The
    proposed accelerator has been successfully used by researchers in the
    search relevance ranking.},
  acknowledgement = ack-nhfb,
  articleno       = {19},
  fjournal        = {ACM Transactions on Reconfigurable Technology and Systems
    (TRETS)},
  journal-url     = {http://portal.acm.org/toc.cfm?id=J1151},
  keywords        = {FPGA; hardware acceleration},
}

@article{Patterson:2009:STP,
  author          = {C. D. Patterson and S. W. Ellingson and B. S. Martin and
    K. Deshpande and J. H. Simonetti and M. Kavic and S. E. Cutchin},
  title           = {Searching for Transient Pulses with the {ETA} Radio
    Telescope},
  journal         = j-TRETS,
  volume          = {1},
  number          = {4},
  pages           = {20:1--20:??},
  month           = jan,
  year            = {2009},
  coden           = {????},
  doi             = {http://doi.acm.org/10.1145/1462586.1462589},
  issn            = {1936-7406 (print), 1936-7414 (electronic)},
  issn-l          = {1936-7406},
  bibdate         = {Mon Jun 1 18:15:01 MDT 2009},
  bibsource       = {http://portal.acm.org/;
    http://www.math.utah.edu/pub/tex/bib/trets.bib},
  abstract        = {Array-based, direct-sampling radio telescopes have
    computational and communication requirements unsuited to conventional
    computer and cluster architectures. Synchronization must be strictly
    maintained across a large number of parallel data streams, from A/D
    conversion, through operations such as beamforming, to dataset
    recording. FPGAs supporting multigigabit serial I/O are ideally suited
    to this application. We describe a recently-constructed radio telescope
    called ETA having all-sky observing capability for detecting low
    frequency pulses from transient events such as gamma ray bursts and
    primordial black hole explosions. Signals from 24 dipole antennas are
    processed by a tiered arrangement of 28 commercial FPGA boards and 4 PCs
    with FPGA-based data acquisition cards, connected with custom I/O
    adapter boards supporting InfiniBand and LVDS physical links. ETA is
    designed for unattended operation, allowing configuration and recording
    to be controlled remotely.},
  acknowledgement = ack-nhfb,
  articleno       = {20},
  fjournal        = {ACM Transactions on Reconfigurable Technology and Systems
    (TRETS)},
  journal-url     = {http://portal.acm.org/toc.cfm?id=J1151},
  keywords        = {Direct sampling radio telescope array; FPGA cluster
    computing; RFI mitigation; signal dedispersion},
}

@article{El-Araby:2009:EPR,
  author          = {Esam El-Araby and Ivan Gonzalez and Tarek El-Ghazawi},
  title           = {Exploiting Partial Runtime Reconfiguration for
    High-Performance Reconfigurable Computing},
  journal         = j-TRETS,
  volume          = {1},
  number          = {4},
  pages           = {21:1--21:??},
  month           = jan,
  year            = {2009},
  coden           = {????},
  doi             = {http://doi.acm.org/10.1145/1462586.1462590},
  issn            = {1936-7406 (print), 1936-7414 (electronic)},
  issn-l          = {1936-7406},
  bibdate         = {Mon Jun 1 18:15:01 MDT 2009},
  bibsource       = {http://portal.acm.org/;
    http://www.math.utah.edu/pub/tex/bib/trets.bib},
  abstract        = {Runtime Reconfiguration (RTR) has been traditionally
    utilized as a means for exploiting the flexibility of High-Performance
    Reconfigurable Computers (HPRCs). However, the RTR feature comes with
    the cost of high configuration overhead which might negatively impact
    the overall performance. Currently, modern FPGAs have more advanced
    mechanisms for reducing the configuration overheads, particularly
    Partial Runtime Reconfiguration (PRTR). It has been perceived that PRTR
    on HPRC systems can be the trend for improving the performance. In this
    work, we will investigate the potential of PRTR on HPRC by formally
    analyzing the execution model and experimentally verifying our
    analytical findings by enabling PRTR for the first time, to the best of
    our knowledge, on one of the current HPRC systems, Cray XD1. Our
    approach is general and can be applied to any of the available HPRC
    systems. The paper will conclude with recommendations and conditions,
    based on our conceptual and experimental work, for the optimal
    utilization of PRTR as well as possible future usage in HPRC.},
  acknowledgement = ack-nhfb,
  articleno       = {21},
  fjournal        = {ACM Transactions on Reconfigurable Technology and Systems
    (TRETS)},
  journal-url     = {http://portal.acm.org/toc.cfm?id=J1151},
  keywords        = {dynamic partial reconfiguration; field programmable gate
    arrays (FPGA); High performance computing; reconfigurable computing},
}

@article{Holland:2009:RRA,
  author          = {Brian Holland and Karthik Nagarajan and Alan D. George},
  title           = {{RAT}: {RC} Amenability Test for Rapid Performance
    Prediction},
  journal         = j-TRETS,
  volume          = {1},
  number          = {4},
  pages           = {22:1--22:??},
  month           = jan,
  year            = {2009},
  coden           = {????},
  doi             = {http://doi.acm.org/10.1145/1462586.1462591},
  issn            = {1936-7406 (print), 1936-7414 (electronic)},
  issn-l          = {1936-7406},
  bibdate         = {Mon Jun 1 18:15:01 MDT 2009},
  bibsource       = {http://portal.acm.org/;
    http://www.math.utah.edu/pub/tex/bib/trets.bib},
  abstract        = {While the promise of achieving speedup and additional
    benefits such as high performance per watt with FPGAs continues to
    expand, chief among the challenges with the emerging paradigm of
    reconfigurable computing is the complexity in application design and
    implementation. Before a lengthy development effort is undertaken to map
    a given application to hardware, it is important that a high-level
    parallel algorithm crafted for that application first be analyzed
    relative to the target platform, so as to ascertain the likelihood of
    success in terms of potential speedup. This article presents the RC
    Amenability Test, or RAT, a methodology and model developed for this
    purpose, supporting rapid exploration and prediction of strategic design
    tradeoffs during the formulation stage of application development.},
  acknowledgement = ack-nhfb,
  articleno       = {22},
  fjournal        = {ACM Transactions on Reconfigurable Technology and Systems
    (TRETS)},
  journal-url     = {http://portal.acm.org/toc.cfm?id=J1151},
  keywords        = {formulation methodology; FPGA; performance prediction;
    reconfigurable computing; strategic design methodology},
}

@Article{Murtaza:2009:CBB,
  author =       "S. Murtaza and A. G. Hoekstra and P. M. A. Sloot",
  title =        "Compute Bound and {I/O} Bound Cellular Automata
                 Simulations on {FPGA} Logic",
  journal =      j-TRETS,
  volume =       "1",
  number =       "4",
  pages =        "23:1--23:??",
  month =        jan,
  year =         "2009",
  CODEN =        "????",
  DOI =          "http://doi.acm.org/10.1145/1462586.1462592",
  ISSN =         "1936-7406 (print), 1936-7414 (electronic)",
  ISSN-L =       "1936-7406",
  bibdate =      "Mon Jun 1 18:15:01 MDT 2009",
  bibsource =    "http://portal.acm.org/;
                 http://www.math.utah.edu/pub/tex/bib/trets.bib",
  abstract =     "FPGA-based computation engines have been used as
                 Cellular Automata accelerators in the scientific
                 community for some time now. With the recent
                 availability of more advanced FPGA logic it becomes
                 necessary to better understand the mapping of Cellular
                 Automata to these systems. There are many trade-offs to
                 consider when mapping a Cellular Automata algorithm
                 from an abstract system to the physical implementation
                 using FPGA logic. The trade-offs include both the
                 available FPGA resources and the Cellular Automata
                 algorithm's execution time. The most important aspect
                 is to fully understand the behavior of the specified CA
                 algorithm in terms of its execution times which are
                 either compute bound or I/O bound. In this article, we
                 present a methodology to categorize a specified CA
                 algorithm as a compute bound or an I/O bound. We take
                 the methodology further by presenting rigorous analysis
                 for each of the two cases identifying the various
                 parameters that control the mapping process and are
                 defined both by the Cellular Automata algorithm and the
                 given FPGA hardware specifications. This methodology
                 helps to predict the performance of running Cellular
                 Automata algorithms on specific FPGA hardware and to
                 determine optimal values for the various parameters
                 that control the mapping process. The model is
                 validated for both compute and I/O bound
                 two-dimensional Cellular Automata algorithms. We find
                 that our model predictions are accurate within 7\%.",
  acknowledgement = ack-nhfb,
  articleno =    "23",
  fjournal =     "ACM Transactions on Reconfigurable Technology and
                 Systems (TRETS)",
  journal-URL =  "http://portal.acm.org/toc.cfm?id=J1151",
  keywords =     "cellular automata; FPGA-based hardware accelerator;
                 High-performance computing; lattice Boltzmann
                 simulations",
}

@article{Bouganis:2009:SOF,
  author          = {Christos-S. Bouganis and Sung-Boem Park and George A.
                     Constantinides and Peter Y. K. Cheung},
  title           = {Synthesis and Optimization of {$2$D} Filter Designs
                     for Heterogeneous {FPGAs}},
  journal         = j-TRETS,
  volume          = {1},
  number          = {4},
  pages           = {24:1--24:??},
  month           = jan,
  year            = {2009},
  CODEN           = {????},
  DOI             = {http://doi.acm.org/10.1145/1462586.1462593},
  ISSN            = {1936-7406 (print), 1936-7414 (electronic)},
  ISSN-L          = {1936-7406},
  bibdate         = {Mon Jun 1 18:15:01 MDT 2009},
  bibsource       = {http://portal.acm.org/;
                     http://www.math.utah.edu/pub/tex/bib/trets.bib},
  abstract        = {Many image processing applications require fast
                     convolution of an image with one or more 2D filters.
                     Field-Programmable Gate Arrays (FPGAs) are often used
                     to achieve this goal due to their fine grain
                     parallelism and reconfigurability. However, the
                     heterogeneous nature of modern reconfigurable devices
                     is not usually considered during design optimization.
                     This article proposes an algorithm that explores the
                     space of possible implementation architectures of 2D
                     filters, targeting the minimization of the required
                     area, by optimizing the usage of the different
                     components in a heterogeneous device. This is achieved
                     by exploring the heterogeneous nature of modern
                     reconfigurable devices using a Singular Value
                     Decomposition based algorithm, which provides an
                     efficient mapping of filter's implementation
                     requirements to the heterogeneous components of modern
                     FPGAs. In the case of multiple 2D filters, the proposed
                     algorithm also exploits any redundancy that exists
                     within each filter and between different filters in the
                     set, leading to designs with minimized area.
                     Experiments with real filter sets from computer vision
                     applications demonstrate an average of up to 38\%
                     reduction in the required area.},
  acknowledgement = ack-nhfb,
  articleno       = {24},
  fjournal        = {ACM Transactions on Reconfigurable Technology and
                     Systems (TRETS)},
  journal-URL     = {http://portal.acm.org/toc.cfm?id=J1151},
  keywords        = {2D filter design; FPGA; reconfigurable logic; Singular
                     Value Decomposition},
}

@article{Schaumont:2009:GEI,
  author          = {Patrick R. Schaumont and Alex K. Jones and Steve
                     Trimberger},
  title           = {{Guest Editors}' Introduction to Security in
                     Reconfigurable Systems Design},
  journal         = j-TRETS,
  volume          = {2},
  number          = {1},
  pages           = {1:1--1:??},
  month           = mar,
  year            = {2009},
  CODEN           = {????},
  DOI             = {http://doi.acm.org/10.1145/1502781.1502782},
  ISSN            = {1936-7406 (print), 1936-7414 (electronic)},
  ISSN-L          = {1936-7406},
  bibdate         = {Mon Jun 1 18:15:27 MDT 2009},
  bibsource       = {http://portal.acm.org/;
                     http://www.math.utah.edu/pub/tex/bib/trets.bib},
  abstract        = {This special issue on Security in Reconfigurable
                     Systems Design reports on recent research results in
                     the design and implementation of trustworthy
                     reconfigurable systems. Five articles cover topics
                     including power-efficient implementation of public-key
                     cryptography, side-channel analysis of electromagnetic
                     radiation, side-channel resistant design, design of
                     robust unclonable functions on an FPGA, and Trojan
                     detection in an FPGA bitstream.},
  acknowledgement = ack-nhfb,
  articleno       = {1},
  fjournal        = {ACM Transactions on Reconfigurable Technology and
                     Systems (TRETS)},
  journal-URL     = {http://portal.acm.org/toc.cfm?id=J1151},
  keywords        = {physically unclonable function; side-channel resistant
                     design; Trojan; Trustworthy design},
}

@Article{Keller:2009:ECC,
  author =       "Maurice Keller and Andrew Byrne and William P.
                 Marnane",
  title =        "Elliptic Curve Cryptography on {FPGA} for Low-Power
                 Applications",
  journal =      j-TRETS,
  volume =       "2",
  number =       "1",
  pages =        "2:1--2:??",
  month =        mar,
  year =         "2009",
  CODEN =        "????",
  DOI =          "http://doi.acm.org/10.1145/1502781.1502783",
  ISSN =         "1936-7406 (print), 1936-7414 (electronic)",
  ISSN-L =       "1936-7406",
  bibdate =      "Mon Jun 1 18:15:27 MDT 2009",
  bibsource =    "http://portal.acm.org/;
                 http://www.math.utah.edu/pub/tex/bib/trets.bib",
  abstract =     "Elliptic curve cryptography has generated a lot of
                 research interest due to its ability to provide greater
                 security per bit compared to public key systems such as
                 RSA. The designer of an elliptic curve hardware
                 accelerator is faced with many choices at design time,
                 each of which can impact the performance of the
                 accelerator in different ways. There are many examples
                 in the literature of how these design choices can
                 affect the area and/or speed of an elliptic curve
                 hardware accelerator. The effect of design choices on
                 power and energy consumption in elliptic curve hardware
                 has been less well studied. This article studies the
                 effect of design choices on the power and energy
                 consumption of an FPGA-based reconfigurable elliptic
                 curve hardware accelerator. A reconfigurable processor
                 has been used for different system parameters and the
                 power and energy consumption measured. The power and
                 energy results are presented and compared.",
  acknowledgement = ack-nhfb,
  articleno =    "2",
  fjournal =     "ACM Transactions on Reconfigurable Technology and
                 Systems (TRETS)",
  journal-URL =  "http://portal.acm.org/toc.cfm?id=J1151",
  keywords =     "Cryptography; elliptic curves; FPGA; low-power",
}

@Article{McEvoy:2009:IWH,
  author =       "Robert P. McEvoy and Colin C. Murphy and William P.
                 Marnane and Michael Tunstall",
  title =        "Isolated {WDDL}: A Hiding Countermeasure for
                 Differential Power Analysis on {FPGAs}",
  journal =      j-TRETS,
  volume =       "2",
  number =       "1",
  pages =        "3:1--3:??",
  month =        mar,
  year =         "2009",
  CODEN =        "????",
  DOI =          "http://doi.acm.org/10.1145/1502781.1502784",
  ISSN =         "1936-7406 (print), 1936-7414 (electronic)",
  ISSN-L =       "1936-7406",
  bibdate =      "Mon Jun 1 18:15:27 MDT 2009",
  bibsource =    "http://portal.acm.org/;
                 http://www.math.utah.edu/pub/tex/bib/trets.bib",
  abstract =     "Security protocols are frequently accelerated by
                 implementing the underlying cryptographic functions in
                 reconfigurable hardware. However, unprotected hardware
                 implementations are susceptible to side-channel
                 attacks, and Differential Power Analysis (DPA) has been
                 shown to be especially powerful. In this work, we
                 evaluate and compare the effectiveness of common hiding
                 countermeasures against DPA in FPGA-based designs,
                 using the Whirlpool hash function as a case study. In
                 particular, we develop a new design flow called
                 Isolated WDDL (IWDDL). In contrast with previous works,
                 IWDDL isolates the direct and complementary circuit
                 paths, and also provides DPA resistance in the Hamming
                 distance power model. The analysis is supported using
                 actual implementation results.",
  acknowledgement = ack-nhfb,
  articleno =    "3",
  fjournal =     "ACM Transactions on Reconfigurable Technology and
                 Systems (TRETS)",
  journal-URL =  "http://portal.acm.org/toc.cfm?id=J1151",
  keywords =     "DPA; FPGA; secure logic; Side-channel attacks;
                 Whirlpool",
}

@article{Sauvage:2009:ERF,
  author          = {Laurent Sauvage and Sylvain Guilley and Yves Mathieu},
  title           = {Electromagnetic Radiations of {FPGAs}: High Spatial
                     Resolution Cartography and Attack on a Cryptographic
                     Module},
  journal         = j-TRETS,
  volume          = {2},
  number          = {1},
  pages           = {4:1--4:??},
  month           = mar,
  year            = {2009},
  CODEN           = {????},
  DOI             = {http://doi.acm.org/10.1145/1502781.1502785},
  ISSN            = {1936-7406 (print), 1936-7414 (electronic)},
  ISSN-L          = {1936-7406},
  bibdate         = {Mon Jun 1 18:15:27 MDT 2009},
  bibsource       = {http://portal.acm.org/;
                     http://www.math.utah.edu/pub/tex/bib/trets.bib},
  abstract        = {Since the first announcement of a Side Channel
                     Analysis (SCA) about ten years ago, considerable
                     research has been devoted to studying these attacks on
                     Application Specific Integrated Circuits (ASICs), such
                     as smart cards or TPMs. In this article, we compare
                     power-line attacks with ElectroMagnetic (EM) attacks,
                     specifically targeting Field Programmable Gate Array
                     devices (FPGAs), as they are becoming widely used for
                     sensitive applications involving cryptography.\par

                     We show experimentally that ElectroMagnetic Analysis
                     (EMA) is always faster than the historical Differential
                     Power Analysis (DPA) in retrieving keys of symmetric
                     ciphers. In addition, these analyses prove to be very
                     convenient to conduct, as they are totally
                     non-invasive.\par

                     Research reports indicate that EMA can be conducted
                     globally, typically with macroscopic home-made coils
                     circling the device under attack, with fair results.
                     However, as accurate professional EM antennas are now
                     becoming more accessible, it has become commonplace to
                     carry out EM analyses locally.\par

                     Cartography has been carried out by optical means on
                     circuits realized with technology greater than 250
                     nanometers. Nonetheless, for deep submicron
                     technologies, the feature size of devices that are
                     spied upon is too small to be visible with photographic
                     techniques. In addition, the presence of the 6+
                     metallization layers obviously prevents a direct
                     observation of the layout. Therefore, EM imaging is
                     emerging as a relevant means to discover the underlying
                     device structure.\par

                     In this article, we present the first images of
                     deep-submicron FPGAs. The resolution is not as accurate
                     as photographic pictures: we notably compare the layout
                     of toy design examples placed at the four corners of
                     the FPGAs with the EM images we collected. We observe
                     that EM imaging has the advantage of revealing active
                     regions, which can be useful in locating a particular
                     processor (visible while active---invisible when
                     inactive).\par

                     In the context of EM attacks, we stress that the exact
                     localization of the cryptographic target is not
                     necessary: the coarse resolution we obtain is
                     sufficient. We note that the EM imaging does not reveal
                     the exact layout of the FPGA, but instead directly
                     guides the attacker towards the areas which are leaking
                     the most. We achieve attacks with an accurate sensor,
                     both far from (namely on a SMC capacitor on the board)
                     and close to (namely directly over the FPGA) the
                     encryption co-processor. As compared to the previously
                     published attacks, we report a successful attack on a
                     DES module in fewer than 6,300 measurements, which is
                     currently the best cracking performance against this
                     encryption algorithm implemented in FPGAs.},
  acknowledgement = ack-nhfb,
  articleno       = {4},
  fjournal        = {ACM Transactions on Reconfigurable Technology and
                     Systems (TRETS)},
  journal-URL     = {http://portal.acm.org/toc.cfm?id=J1151},
  keywords        = {cartography; DPA; EMA; FPGA; SCA; security},
}

@article{Majzoobi:2009:TDI,
  author          = {Mehrdad Majzoobi and Farinaz Koushanfar and Miodrag
                     Potkonjak},
  title           = {Techniques for Design and Implementation of Secure
                     Reconfigurable {PUFs}},
  journal         = j-TRETS,
  volume          = {2},
  number          = {1},
  pages           = {5:1--5:??},
  month           = mar,
  year            = {2009},
  CODEN           = {????},
  DOI             = {http://doi.acm.org/10.1145/1502781.1502786},
  ISSN            = {1936-7406 (print), 1936-7414 (electronic)},
  ISSN-L          = {1936-7406},
  bibdate         = {Mon Jun 1 18:15:27 MDT 2009},
  bibsource       = {http://portal.acm.org/;
                     http://www.math.utah.edu/pub/tex/bib/trets.bib},
  abstract        = {Physically unclonable functions (PUFs) provide a basis
                     for many security and digital rights management
                     protocols. PUF-based security approaches have numerous
                     comparative strengths with respect to traditional
                     cryptography-based techniques, including resilience
                     against physical and side channel attacks and
                     suitability for lightweight protocols. However,
                     classical delay-based PUF structures have a number of
                     drawbacks including susceptibility to guessing, reverse
                     engineering, and emulation attacks, as well as
                     sensitivity to operational and environmental
                     variations.\par

                     To address these limitations, we have developed a new
                     set of techniques for FPGA-based PUF design and
                     implementation. We demonstrate how reconfigurability
                     can be exploited to eliminate the stated PUF
                     limitations. We also show how FPGA-based PUFs can be
                     used for privacy protection. Furthermore,
                     reconfigurability enables the introduction of new
                     techniques for PUF testing. The effectiveness of all
                     the proposed techniques is validated using extensive
                     implementations, simulations, and statistical
                     analysis.},
  acknowledgement = ack-nhfb,
  articleno       = {5},
  fjournal        = {ACM Transactions on Reconfigurable Technology and
                     Systems (TRETS)},
  journal-URL     = {http://portal.acm.org/toc.cfm?id=J1151},
  keywords        = {hardware security; physically unclonable functions;
                     process variation; Reconfigurable systems},
}

@Article{Dutt:2009:TBD,
  author =       "Shantanu Dutt and Li Li",
  title =        "Trust-Based Design and Check of {FPGA} Circuits Using
                 Two-Level Randomized {ECC} Structures",
  journal =      j-TRETS,
  volume =       "2",
  number =       "1",
  pages =        "6:1--6:??",
  month =        mar,
  year =         "2009",
  CODEN =        "????",
  DOI =          "http://doi.acm.org/10.1145/1502781.1508209",
  ISSN =         "1936-7406 (print), 1936-7414 (electronic)",
  ISSN-L =       "1936-7406",
  bibdate =      "Mon Jun 1 18:15:27 MDT 2009",
  bibsource =    "http://portal.acm.org/;
                 http://www.math.utah.edu/pub/tex/bib/trets.bib",
  abstract =     "A novel trust-based design method for FPGA circuits
                 that uses error-correcting code (ECC) structures for
                 detecting design tampers (changes, deletion of existing
                 logic, and addition of extradesign logic-like Trojans)
                 is proposed in this article. We determine ECC-based CLB
                 (configuration logic block) parity groups and embed the
                 check CLBs for each parity group in the FPGA circuit.
                 During a trust-checking phase, a Test-Pattern Generator
                 (TPG) and an Output Response Analyzer (ORA), configured
                 in the FPGA, are used to check that each parity group
                 of CLB outputs produce the expected parities. We use
                 two levels of randomization to thwart attempts by an
                 adversary to discover the parity groups and inject
                 tampers that mask each other, or to tamper with the TPG
                 and ORA so that design tampers remain undetected: (a)
                 randomization of the mapping of the ECC parity groups
                 to the CLB array; (b) randomization within each parity
                 group of odd and even parities for different input
                 combinations (classically, all ECC parity groups have
                 even parities across all inputs). These randomizations
                 along with the error-detecting property of the
                 underlying ECC lead to design tampers being uncovered
                 with very high probabilities, as we show both
                 analytically and empirically. We also classify
                 different CLB function structures and impose a parity
                 group selection in which only similarly structured
                 functions are randomly selected to be in the same
                 parity group in order to minimize check function
                 complexity. Using the 2D code as our underlying ECC and
                 its 2-level randomization, our experiments with
                 inserting 1--10 circuit CLB tampers and 1--5 extraneous
                 logic CLBs in two medium-size circuits and a RISC
                 processor circuit implemented on a Xilinx Spartan-3
                 FPGA show promising results of 100\% tamper detection
                 and 0\% false alarms, obtained at a hardware overhead
                 of only 7--10\%.",
  acknowledgement = ack-nhfb,
  articleno =    "6",
  fjournal =     "ACM Transactions on Reconfigurable Technology and
                 Systems (TRETS)",
  journal-URL =  "http://portal.acm.org/toc.cfm?id=J1151",
  keywords =     "Error-correcting codes; FPGAs; masking probability;
                 parity groups; parity randomization; trust checking;
                 trust-based design",
}

@Article{Amano:2009:GEI,
  author =       "Hideharu Amano and Tadao Nakamura",
  title =        "{Guest Editors}' Introduction: {ICFPT 2007}",
  journal =      j-TRETS,
  volume =       "2",
  number =       "2",
  pages =        "7:1--7:??",
  month =        jun,
  year =         "2009",
  CODEN =        "????",
  DOI =          "http://doi.acm.org/10.1145/1534916.1534917",
  ISSN =         "1936-7406 (print), 1936-7414 (electronic)",
  ISSN-L =       "1936-7406",
  bibdate =      "Tue Mar 16 09:46:50 MDT 2010",
  bibsource =    "http://portal.acm.org/;
                 http://www.math.utah.edu/pub/tex/bib/trets.bib",
  acknowledgement = ack-nhfb,
  articleno =    "7",
  fjournal =     "ACM Transactions on Reconfigurable Technology and
                 Systems (TRETS)",
  journal-URL =  "http://portal.acm.org/toc.cfm?id=J1151",
}

@article{Zhao:2009:TMB,
  author          = {Weisheng Zhao and Eric Belhaire and Claude Chappert
                     and Bernard Dieny and Guillaume Prenat},
  title           = {{TAS-MRAM}-Based Low-Power High-Speed Runtime
                     Reconfiguration {(RTR) FPGA}},
  journal         = j-TRETS,
  volume          = {2},
  number          = {2},
  pages           = {8:1--8:??},
  month           = jun,
  year            = {2009},
  CODEN           = {????},
  DOI             = {http://doi.acm.org/10.1145/1534916.1534918},
  ISSN            = {1936-7406 (print), 1936-7414 (electronic)},
  ISSN-L          = {1936-7406},
  bibdate         = {Tue Mar 16 09:46:50 MDT 2010},
  bibsource       = {http://portal.acm.org/;
                     http://www.math.utah.edu/pub/tex/bib/trets.bib},
  acknowledgement = ack-nhfb,
  articleno       = {8},
  fjournal        = {ACM Transactions on Reconfigurable Technology and
                     Systems (TRETS)},
  journal-URL     = {http://portal.acm.org/toc.cfm?id=J1151},
}

@article{Koch:2009:HDT,
  author          = {Dirk Koch and Christian Beckhoff and J{\"u}rgen
                     Teich},
  title           = {Hardware Decompression Techniques for {FPGA}-Based
                     Embedded Systems},
  journal         = j-TRETS,
  volume          = {2},
  number          = {2},
  pages           = {9:1--9:??},
  month           = jun,
  year            = {2009},
  CODEN           = {????},
  DOI             = {http://doi.acm.org/10.1145/1534916.1534919},
  ISSN            = {1936-7406 (print), 1936-7414 (electronic)},
  ISSN-L          = {1936-7406},
  bibdate         = {Tue Mar 16 09:46:50 MDT 2010},
  bibsource       = {http://portal.acm.org/;
                     http://www.math.utah.edu/pub/tex/bib/trets.bib},
  acknowledgement = ack-nhfb,
  articleno       = {9},
  fjournal        = {ACM Transactions on Reconfigurable Technology and
                     Systems (TRETS)},
  journal-URL     = {http://portal.acm.org/toc.cfm?id=J1151},
}

@article{Wong:2009:SMC,
  author          = {Justin S. J. Wong and Pete Sedcole and Peter Y. K.
                     Cheung},
  title           = {Self-Measurement of Combinatorial Circuit Delays in
                     {FPGAs}},
  journal         = j-TRETS,
  volume          = {2},
  number          = {2},
  pages           = {10:1--10:??},
  month           = jun,
  year            = {2009},
  CODEN           = {????},
  DOI             = {http://doi.acm.org/10.1145/1534916.1534920},
  ISSN            = {1936-7406 (print), 1936-7414 (electronic)},
  ISSN-L          = {1936-7406},
  bibdate         = {Tue Mar 16 09:46:50 MDT 2010},
  bibsource       = {http://portal.acm.org/;
                     http://www.math.utah.edu/pub/tex/bib/trets.bib},
  acknowledgement = ack-nhfb,
  articleno       = {10},
  fjournal        = {ACM Transactions on Reconfigurable Technology and
                     Systems (TRETS)},
  journal-URL     = {http://portal.acm.org/toc.cfm?id=J1151},
}

@article{Seetharaman:2009:ASF,
  author          = {G. Seetharaman and B. Venkataramani},
  title           = {Automation Schemes for {FPGA} Implementation of
                     Wave-Pipelined Circuits},
  journal         = j-TRETS,
  volume          = {2},
  number          = {2},
  pages           = {11:1--11:??},
  month           = jun,
  year            = {2009},
  CODEN           = {????},
  DOI             = {http://doi.acm.org/10.1145/1534916.1534921},
  ISSN            = {1936-7406 (print), 1936-7414 (electronic)},
  ISSN-L          = {1936-7406},
  bibdate         = {Tue Mar 16 09:46:50 MDT 2010},
  bibsource       = {http://portal.acm.org/;
                     http://www.math.utah.edu/pub/tex/bib/trets.bib},
  acknowledgement = ack-nhfb,
  articleno       = {11},
  fjournal        = {ACM Transactions on Reconfigurable Technology and
                     Systems (TRETS)},
  journal-URL     = {http://portal.acm.org/toc.cfm?id=J1151},
}

@article{Yu:2009:VPS,
  author          = {Jason Yu and Christopher Eagleston and Christopher
                     Han-Yu Chou and Maxime Perreault and Guy Lemieux},
  title           = {Vector Processing as a Soft Processor Accelerator},
  journal         = j-TRETS,
  volume          = {2},
  number          = {2},
  pages           = {12:1--12:??},
  month           = jun,
  year            = {2009},
  CODEN           = {????},
  DOI             = {http://doi.acm.org/10.1145/1534916.1534922},
  ISSN            = {1936-7406 (print), 1936-7414 (electronic)},
  ISSN-L          = {1936-7406},
  bibdate         = {Tue Mar 16 09:46:50 MDT 2010},
  bibsource       = {http://portal.acm.org/;
                     http://www.math.utah.edu/pub/tex/bib/trets.bib},
  acknowledgement = ack-nhfb,
  articleno       = {12},
  fjournal        = {ACM Transactions on Reconfigurable Technology and
                     Systems (TRETS)},
  journal-URL     = {http://portal.acm.org/toc.cfm?id=J1151},
}

@article{Cevrero:2009:FPC,
  author          = {Alessandro Cevrero and Panagiotis Athanasopoulos and
                     Hadi Parandeh-Afshar and Ajay K. Verma and Hosein Seyed
                     Attarzadeh Niaki and Chrysostomos Nicopoulos and Frank
                     K. Gurkaynak and Philip Brisk and Yusuf Leblebici and
                     Paolo Ienne},
  title           = {Field Programmable Compressor Trees: Acceleration of
                     Multi-Input Addition on {FPGAs}},
  journal         = j-TRETS,
  volume          = {2},
  number          = {2},
  pages           = {13:1--13:??},
  month           = jun,
  year            = {2009},
  CODEN           = {????},
  DOI             = {http://doi.acm.org/10.1145/1534916.1534923},
  ISSN            = {1936-7406 (print), 1936-7414 (electronic)},
  ISSN-L          = {1936-7406},
  bibdate         = {Tue Mar 16 09:46:50 MDT 2010},
  bibsource       = {http://portal.acm.org/;
                     http://www.math.utah.edu/pub/tex/bib/trets.bib},
  acknowledgement = ack-nhfb,
  articleno       = {13},
  fjournal        = {ACM Transactions on Reconfigurable Technology and
                     Systems (TRETS)},
  journal-URL     = {http://portal.acm.org/toc.cfm?id=J1151},
}

@article{Jang:2009:WFT,
  author          = {Stephen Jang and Billy Chan and Kevin Chung and Alan
                     Mishchenko},
  title           = {{WireMap}: {FPGA} Technology Mapping for Improved
                     Routability and Enhanced {LUT} Merging},
  journal         = j-TRETS,
  volume          = {2},
  number          = {2},
  pages           = {14:1--14:??},
  month           = jun,
  year            = {2009},
  CODEN           = {????},
  DOI             = {http://doi.acm.org/10.1145/1534916.1534924},
  ISSN            = {1936-7406 (print), 1936-7414 (electronic)},
  ISSN-L          = {1936-7406},
  bibdate         = {Tue Mar 16 09:46:50 MDT 2010},
  bibsource       = {http://portal.acm.org/;
                     http://www.math.utah.edu/pub/tex/bib/trets.bib},
  acknowledgement = ack-nhfb,
  articleno       = {14},
  fjournal        = {ACM Transactions on Reconfigurable Technology and
                     Systems (TRETS)},
  journal-URL     = {http://portal.acm.org/toc.cfm?id=J1151},
}

@article{Chung:2009:PTS,
  author          = {Eric S. Chung and Michael K. Papamichael and Eriko
                     Nurvitadhi and James C. Hoe and Ken Mai and Babak
                     Falsafi},
  title           = {{ProtoFlex}: Towards Scalable, Full-System
                     Multiprocessor Simulations Using {FPGAs}},
  journal         = j-TRETS,
  volume          = {2},
  number          = {2},
  pages           = {15:1--15:??},
  month           = jun,
  year            = {2009},
  CODEN           = {????},
  DOI             = {http://doi.acm.org/10.1145/1534916.1534925},
  ISSN            = {1936-7406 (print), 1936-7414 (electronic)},
  ISSN-L          = {1936-7406},
  bibdate         = {Tue Mar 16 09:46:50 MDT 2010},
  bibsource       = {http://portal.acm.org/;
                     http://www.math.utah.edu/pub/tex/bib/trets.bib},
  acknowledgement = ack-nhfb,
  articleno       = {15},
  fjournal        = {ACM Transactions on Reconfigurable Technology and
                     Systems (TRETS)},
  journal-URL     = {http://portal.acm.org/toc.cfm?id=J1151},
}

@article{Pellauer:2009:PNP,
  author          = {Michael Pellauer and Muralidaran Vijayaraghavan and
                     Michael Adler and Arvind and Joel Emer},
  title           = {{A}-Port Networks: Preserving the Timed Behavior of
                     Synchronous Systems for Modeling on {FPGAs}},
  journal         = j-TRETS,
  volume          = {2},
  number          = {3},
  pages           = {16:1--16:??},
  month           = sep,
  year            = {2009},
  CODEN           = {????},
  DOI             = {http://doi.acm.org/10.1145/1575774.1575775},
  ISSN            = {1936-7406 (print), 1936-7414 (electronic)},
  ISSN-L          = {1936-7406},
  bibdate         = {Tue Mar 16 09:46:54 MDT 2010},
  bibsource       = {http://portal.acm.org/;
                     http://www.math.utah.edu/pub/tex/bib/trets.bib},
  acknowledgement = ack-nhfb,
  articleno       = {16},
  fjournal        = {ACM Transactions on Reconfigurable Technology and
                     Systems (TRETS)},
  journal-URL     = {http://portal.acm.org/toc.cfm?id=J1151},
}

@article{Cong:2009:FBH,
  author          = {Jason Cong and Yi Zou},
  title           = {{FPGA}-Based Hardware Acceleration of Lithographic
                     Aerial Image Simulation},
  journal         = j-TRETS,
  volume          = {2},
  number          = {3},
  pages           = {17:1--17:??},
  month           = sep,
  year            = {2009},
  CODEN           = {????},
  DOI             = {http://doi.acm.org/10.1145/1575774.1575776},
  ISSN            = {1936-7406 (print), 1936-7414 (electronic)},
  ISSN-L          = {1936-7406},
  bibdate         = {Tue Mar 16 09:46:54 MDT 2010},
  bibsource       = {http://portal.acm.org/;
                     http://www.math.utah.edu/pub/tex/bib/trets.bib},
  acknowledgement = ack-nhfb,
  articleno       = {17},
  fjournal        = {ACM Transactions on Reconfigurable Technology and
                     Systems (TRETS)},
  journal-URL     = {http://portal.acm.org/toc.cfm?id=J1151},
}

@article{Ahmed:2009:PTV,
  author          = {Taneem Ahmed and Paul D. Kundarewich and Jason H.
                     Anderson},
  title           = {Packing Techniques for {Virtex-5 FPGAs}},
  journal         = j-TRETS,
  volume          = {2},
  number          = {3},
  pages           = {18:1--18:??},
  month           = sep,
  year            = {2009},
  CODEN           = {????},
  DOI             = {http://doi.acm.org/10.1145/1575774.1575777},
  ISSN            = {1936-7406 (print), 1936-7414 (electronic)},
  ISSN-L          = {1936-7406},
  bibdate         = {Tue Mar 16 09:46:54 MDT 2010},
  bibsource       = {http://portal.acm.org/;
                     http://www.math.utah.edu/pub/tex/bib/trets.bib},
  acknowledgement = ack-nhfb,
  articleno       = {18},
  fjournal        = {ACM Transactions on Reconfigurable Technology and
                     Systems (TRETS)},
  journal-URL     = {http://portal.acm.org/toc.cfm?id=J1151},
}

@article{Parandeh-Afshar:2009:FLC,
  author          = {Hadi Parandeh-Afshar and Philip Brisk and Paolo
                     Ienne},
  title           = {An {FPGA} Logic Cell and Carry Chain Configurable as a
                     6:2 or 7:2 Compressor},
  journal         = j-TRETS,
  volume          = {2},
  number          = {3},
  pages           = {19:1--19:??},
  month           = sep,
  year            = {2009},
  CODEN           = {????},
  DOI             = {http://doi.acm.org/10.1145/1575774.1575778},
  ISSN            = {1936-7406 (print), 1936-7414 (electronic)},
  ISSN-L          = {1936-7406},
  bibdate         = {Tue Mar 16 09:46:54 MDT 2010},
  bibsource       = {http://portal.acm.org/;
                     http://www.math.utah.edu/pub/tex/bib/trets.bib},
  acknowledgement = ack-nhfb,
  articleno       = {19},
  fjournal        = {ACM Transactions on Reconfigurable Technology and
                     Systems (TRETS)},
  journal-URL     = {http://portal.acm.org/toc.cfm?id=J1151},
}

@article{Compton:2009:ISI,
  author          = {Katherine Compton and Roger Woods and Christos Bouganis and Pedro Diniz},
  title           = {Introduction to the Special Issue {ARC'08}},
  journal         = j-TRETS,
  fjournal        = {ACM Transactions on Reconfigurable Technology and Systems (TRETS)},
  year            = {2009},
  month           = sep,
  volume          = {2},
  number          = {4},
  articleno       = {20},
  pages           = {20:1--20:??},
  doi             = {http://doi.acm.org/10.1145/1575779.1575780},
  issn            = {1936-7406 (print), 1936-7414 (electronic)},
  issn-l          = {1936-7406},
  coden           = {????},
  journal-url     = {http://portal.acm.org/toc.cfm?id=J1151},
  bibsource       = {http://portal.acm.org/; http://www.math.utah.edu/pub/tex/bib/trets.bib},
  bibdate         = {Tue Mar 16 09:46:56 MDT 2010},
  acknowledgement = ack-nhfb,
}

@article{Jin:2009:ERA,
  author          = {Qiwei Jin and David B. Thomas and Wayne Luk and Benjamin Cope},
  title           = {Exploring Reconfigurable Architectures for Tree-Based Option Pricing Models},
  journal         = j-TRETS,
  fjournal        = {ACM Transactions on Reconfigurable Technology and Systems (TRETS)},
  year            = {2009},
  month           = sep,
  volume          = {2},
  number          = {4},
  articleno       = {21},
  pages           = {21:1--21:??},
  doi             = {http://doi.acm.org/10.1145/1575779.1575781},
  issn            = {1936-7406 (print), 1936-7414 (electronic)},
  issn-l          = {1936-7406},
  coden           = {????},
  journal-url     = {http://portal.acm.org/toc.cfm?id=J1151},
  bibsource       = {http://portal.acm.org/; http://www.math.utah.edu/pub/tex/bib/trets.bib},
  bibdate         = {Tue Mar 16 09:46:56 MDT 2010},
  acknowledgement = ack-nhfb,
}

@article{Angelopoulou:2009:RRT,
  author          = {Maria E. Angelopoulou and Christos-Savvas Bouganis and Peter Y. K. Cheung and George A. Constantinides},
  title           = {Robust Real-Time Super-Resolution on {FPGA} and an Application to Video Enhancement},
  journal         = j-TRETS,
  fjournal        = {ACM Transactions on Reconfigurable Technology and Systems (TRETS)},
  year            = {2009},
  month           = sep,
  volume          = {2},
  number          = {4},
  articleno       = {22},
  pages           = {22:1--22:??},
  doi             = {http://doi.acm.org/10.1145/1575779.1575782},
  issn            = {1936-7406 (print), 1936-7414 (electronic)},
  issn-l          = {1936-7406},
  coden           = {????},
  journal-url     = {http://portal.acm.org/toc.cfm?id=J1151},
  bibsource       = {http://portal.acm.org/; http://www.math.utah.edu/pub/tex/bib/trets.bib},
  bibdate         = {Tue Mar 16 09:46:56 MDT 2010},
  acknowledgement = ack-nhfb,
}

@article{Lo:2009:SOC,
  author          = {Chia-Tien Dan Lo and Yi-Gang Tai},
  title           = {Space Optimization on Counters for {FPGA}-Based {Perl} Compatible Regular Expressions},
  journal         = j-TRETS,
  fjournal        = {ACM Transactions on Reconfigurable Technology and Systems (TRETS)},
  year            = {2009},
  month           = sep,
  volume          = {2},
  number          = {4},
  articleno       = {23},
  pages           = {23:1--23:??},
  doi             = {http://doi.acm.org/10.1145/1575779.1575783},
  issn            = {1936-7406 (print), 1936-7414 (electronic)},
  issn-l          = {1936-7406},
  coden           = {????},
  journal-url     = {http://portal.acm.org/toc.cfm?id=J1151},
  bibsource       = {http://portal.acm.org/; http://www.math.utah.edu/pub/tex/bib/trets.bib},
  bibdate         = {Tue Mar 16 09:46:56 MDT 2010},
  acknowledgement = ack-nhfb,
}

@article{Vassiliadis:2009:ADF,
  author          = {Nikolaos Vassiliadis and George Theodoridis and Spiridon Nikolaidis},
  title           = {An Application Development Framework for {ARISE} Reconfigurable Processors},
  journal         = j-TRETS,
  fjournal        = {ACM Transactions on Reconfigurable Technology and Systems (TRETS)},
  year            = {2009},
  month           = sep,
  volume          = {2},
  number          = {4},
  articleno       = {24},
  pages           = {24:1--24:??},
  doi             = {http://doi.acm.org/10.1145/1575779.1575784},
  issn            = {1936-7406 (print), 1936-7414 (electronic)},
  issn-l          = {1936-7406},
  coden           = {????},
  journal-url     = {http://portal.acm.org/toc.cfm?id=J1151},
  bibsource       = {http://portal.acm.org/; http://www.math.utah.edu/pub/tex/bib/trets.bib},
  bibdate         = {Tue Mar 16 09:46:56 MDT 2010},
  acknowledgement = ack-nhfb,
}

@article{Dragomir:2009:OLU,
  author          = {Ozana Silvia Dragomir and Todor Stefanov and Koen Bertels},
  title           = {Optimal Loop Unrolling and Shifting for Reconfigurable Architectures},
  journal         = j-TRETS,
  fjournal        = {ACM Transactions on Reconfigurable Technology and Systems (TRETS)},
  year            = {2009},
  month           = sep,
  volume          = {2},
  number          = {4},
  articleno       = {25},
  pages           = {25:1--25:??},
  doi             = {http://doi.acm.org/10.1145/1575779.1575785},
  issn            = {1936-7406 (print), 1936-7414 (electronic)},
  issn-l          = {1936-7406},
  coden           = {????},
  journal-url     = {http://portal.acm.org/toc.cfm?id=J1151},
  bibsource       = {http://portal.acm.org/; http://www.math.utah.edu/pub/tex/bib/trets.bib},
  bibdate         = {Tue Mar 16 09:46:56 MDT 2010},
  acknowledgement = ack-nhfb,
}

@article{Underwood:2009:SSL,
  author          = {Keith D. Underwood and K. Scott Hemmert and Craig D. Ulmer},
  title           = {From Silicon to Science: The Long Road to Production Reconfigurable Supercomputing},
  journal         = j-TRETS,
  fjournal        = {ACM Transactions on Reconfigurable Technology and Systems (TRETS)},
  year            = {2009},
  month           = sep,
  volume          = {2},
  number          = {4},
  articleno       = {26},
  pages           = {26:1--26:??},
  doi             = {http://doi.acm.org/10.1145/1575779.1575786},
  issn            = {1936-7406 (print), 1936-7414 (electronic)},
  issn-l          = {1936-7406},
  coden           = {????},
  journal-url     = {http://portal.acm.org/toc.cfm?id=J1151},
  bibsource       = {http://portal.acm.org/; http://www.math.utah.edu/pub/tex/bib/trets.bib},
  bibdate         = {Tue Mar 16 09:46:56 MDT 2010},
  acknowledgement = ack-nhfb,
}

@article{Roldao:2010:HTF,
  author          = {Antonio Roldao and George A. Constantinides},
  title           = {A High Throughput {FPGA}-Based Floating Point Conjugate Gradient Implementation for Dense Matrices},
  journal         = j-TRETS,
  fjournal        = {ACM Transactions on Reconfigurable Technology and Systems (TRETS)},
  year            = {2010},
  month           = jan,
  volume          = {3},
  number          = {1},
  articleno       = {1},
  pages           = {1:1--1:??},
  doi             = {http://doi.acm.org/10.1145/1661438.1661439},
  issn            = {1936-7406 (print), 1936-7414 (electronic)},
  issn-l          = {1936-7406},
  coden           = {????},
  journal-url     = {http://portal.acm.org/toc.cfm?id=J1151},
  bibsource       = {http://portal.acm.org/; http://www.math.utah.edu/pub/tex/bib/trets.bib},
  bibdate         = {Tue Mar 16 09:47:03 MDT 2010},
  acknowledgement = ack-nhfb,
}

@article{Dubois:2010:SMV,
  author          = {David Dubois and Andrew Dubois and Thomas Boorman and Carolyn Connor and Steve Poole},
  title           = {Sparse Matrix-Vector Multiplication on a Reconfigurable Supercomputer with Application},
  journal         = j-TRETS,
  fjournal        = {ACM Transactions on Reconfigurable Technology and Systems (TRETS)},
  year            = {2010},
  month           = jan,
  volume          = {3},
  number          = {1},
  articleno       = {2},
  pages           = {2:1--2:??},
  doi             = {http://doi.acm.org/10.1145/1661438.1661440},
  issn            = {1936-7406 (print), 1936-7414 (electronic)},
  issn-l          = {1936-7406},
  coden           = {????},
  journal-url     = {http://portal.acm.org/toc.cfm?id=J1151},
  bibsource       = {http://portal.acm.org/; http://www.math.utah.edu/pub/tex/bib/trets.bib},
  bibdate         = {Tue Mar 16 09:47:03 MDT 2010},
  acknowledgement = ack-nhfb,
}

@article{Drimer:2010:DBP,
  author          = {Saar Drimer and Tim G{\"u}neysu and Christof Paar},
  title           = {{DSPs}, {BRAMs}, and a Pinch of Logic: Extended Recipes for {AES} on {FPGAs}},
  journal         = j-TRETS,
  fjournal        = {ACM Transactions on Reconfigurable Technology and Systems (TRETS)},
  year            = {2010},
  month           = jan,
  volume          = {3},
  number          = {1},
  articleno       = {3},
  pages           = {3:1--3:??},
  doi             = {http://doi.acm.org/10.1145/1661438.1661441},
  issn            = {1936-7406 (print), 1936-7414 (electronic)},
  issn-l          = {1936-7406},
  coden           = {????},
  journal-url     = {http://portal.acm.org/toc.cfm?id=J1151},
  bibsource       = {http://portal.acm.org/; http://www.math.utah.edu/pub/tex/bib/trets.bib},
  bibdate         = {Tue Mar 16 09:47:03 MDT 2010},
  acknowledgement = ack-nhfb,
}

@article{Koh:2010:CMP,
  author          = {Shannon Koh and Oliver Diessel},
  title           = {Configuration Merging in Point-to-Point Networks for Module-Based {FPGA} Reconfiguration},
  journal         = j-TRETS,
  fjournal        = {ACM Transactions on Reconfigurable Technology and Systems (TRETS)},
  year            = {2010},
  month           = jan,
  volume          = {3},
  number          = {1},
  articleno       = {4},
  pages           = {4:1--4:??},
  doi             = {http://doi.acm.org/10.1145/1661438.1661442},
  issn            = {1936-7406 (print), 1936-7414 (electronic)},
  issn-l          = {1936-7406},
  coden           = {????},
  journal-url     = {http://portal.acm.org/toc.cfm?id=J1151},
  bibsource       = {http://portal.acm.org/; http://www.math.utah.edu/pub/tex/bib/trets.bib},
  bibdate         = {Tue Mar 16 09:47:03 MDT 2010},
  acknowledgement = ack-nhfb,
}

@article{Curreri:2010:PAF,
  author          = {John Curreri and Seth Koehler and Alan D. George and Brian Holland and Rafael Garcia},
  title           = {Performance Analysis Framework for High-Level Language Applications in Reconfigurable Computing},
  journal         = j-TRETS,
  fjournal        = {ACM Transactions on Reconfigurable Technology and Systems (TRETS)},
  year            = {2010},
  month           = jan,
  volume          = {3},
  number          = {1},
  articleno       = {5},
  pages           = {5:1--5:??},
  doi             = {http://doi.acm.org/10.1145/1661438.1661443},
  issn            = {1936-7406 (print), 1936-7414 (electronic)},
  issn-l          = {1936-7406},
  coden           = {????},
  journal-url     = {http://portal.acm.org/toc.cfm?id=J1151},
  bibsource       = {http://portal.acm.org/; http://www.math.utah.edu/pub/tex/bib/trets.bib},
  bibdate         = {Tue Mar 16 09:47:03 MDT 2010},
  acknowledgement = ack-nhfb,
}

@article{Bodily:2010:CSI,
  author          = {John Bodily and Brent Nelson and Zhaoyi Wei and Dah-Jye Lee and Jeff Chase},
  title           = {A Comparison Study on Implementing Optical Flow and Digital Communications on {FPGAs} and {GPUs}},
  journal         = j-TRETS,
  fjournal        = {ACM Transactions on Reconfigurable Technology and Systems (TRETS)},
  year            = {2010},
  month           = may,
  volume          = {3},
  number          = {2},
  articleno       = {6},
  pages           = {6:1--6:??},
  doi             = {http://doi.acm.org/10.1145/1754386.1754387},
  issn            = {1936-7406 (print), 1936-7414 (electronic)},
  issn-l          = {1936-7406},
  coden           = {????},
  journal-url     = {http://portal.acm.org/toc.cfm?id=J1151},
  keywords        = {Digital communications; FPGA; GPU; optical flow; reconfigurable computing},
  abstract        = {FPGA devices have often found use as
    higher-performance alternatives to programmable
    processors for implementing computations. Applications
    successfully implemented on FPGAs typically contain
    high levels of parallelism and often use simple
    statically scheduled control and modest arithmetic.
    Recently introduced computing devices such as
    coarse-grain reconfigurable arrays, multi-core
    processors, and graphical processing units promise to
    significantly change the computational landscape and
    take advantage of many of the same application
    characteristics that fit well on FPGAs. One real-time
    computing task, optical flow, is difficult to apply in
    robotic vision applications because of its high
    computational and data rate requirements, and so is a
    good candidate for implementation on FPGAs and other
    custom computing architectures. This article reports on
    a series of experiments mapping a collection of
    different algorithms onto both an FPGA and a GPU. For
    two different optical flow algorithms the GPU had
    better performance, while for a set of digital comm
    MIMO computations, they had similar performance. In all
    cases the FPGA implementations required 10x the
    development time. Finally, a discussion of the two
    technology's characteristics is given to show they
    achieve high performance in different ways.},
  bibsource       = {http://portal.acm.org/; http://www.math.utah.edu/pub/tex/bib/trets.bib},
  bibdate         = {Tue Jun 22 16:00:33 MDT 2010},
  acknowledgement = ack-nhfb,
}

@article{Papadopoulos:2010:TRM,
  author          = {Konstantinos Papadopoulos and Ioannis Papaefstathiou},
  title           = {{Titan-R}: a Multigigabit Reconfigurable Combined Compression\slash Decompression Unit},
  journal         = j-TRETS,
  fjournal        = {ACM Transactions on Reconfigurable Technology and Systems (TRETS)},
  year            = {2010},
  month           = may,
  volume          = {3},
  number          = {2},
  articleno       = {7},
  pages           = {7:1--7:??},
  doi             = {http://doi.acm.org/10.1145/1754386.1754388},
  issn            = {1936-7406 (print), 1936-7414 (electronic)},
  issn-l          = {1936-7406},
  coden           = {????},
  journal-url     = {http://portal.acm.org/toc.cfm?id=J1151},
  keywords        = {data compression; FPGA; hardware algorithms; networking; parallel processing; reconfigurable computing; Stream processing},
  abstract        = {Data compression techniques can alleviate bandwidth
    problems in even multigigabit networks and are
    especially useful when combined with encryption. This
    article demonstrates a reconfigurable hardware
    compressor/decompressor core, the Titan-R, which can
    compress/decompress data streams at 8.5 Gb/sec, making
    it the fastest reconfigurable such device ever
    proposed; the presented full-duplex implementation
    allows for fully symmetric compression and
    decompression rates at 8.5 Gbps each. Its compression
    algorithm is a variation of the most widely used and
    efficient such scheme, the Lempel--Ziv (LZ) algorithm
    that uses part of the previous input stream as the
    dictionary. In order to support this high network
    throughput, the Titan-R utilizes a very fine-grained
    pipeline and takes advantage of the high bandwidth
    provided by the distributed on-chip RAMs of
    state-of-the-art FPGAs.},
  bibsource       = {http://portal.acm.org/; http://www.math.utah.edu/pub/tex/bib/trets.bib},
  bibdate         = {Tue Jun 22 16:00:33 MDT 2010},
  acknowledgement = ack-nhfb,
}

@Article{Badrignans:2010:SSA,
  author =       "Beno{\^\i}t Badrignans and David Champagne and Reouven
                 Elbaz and Catherine Gebotys and Lionel Torres",
  title =        "{SARFUM}: Security Architecture for Remote {FPGA}
                 Update and Monitoring",
  journal =      j-TRETS,
  volume =       "3",
  number =       "2",
  pages =        "8:1--8:??",
  month =        may,
  year =         "2010",
  CODEN =        "????",
  DOI =          "http://doi.acm.org/10.1145/1754386.1754389",
  ISSN =         "1936-7406 (print), 1936-7414 (electronic)",
  ISSN-L =       "1936-7406",
  bibdate =      "Tue Jun 22 16:00:33 MDT 2010",
  bibsource =    "http://portal.acm.org/;
                 http://www.math.utah.edu/pub/tex/bib/trets.bib",
  abstract =     "Remote update of hardware platforms or embedded
                 systems is a convenient service enabled by Field
                 Programmable Gate Array (FPGA)-based systems. This
                 service is often essential in applications like
                 space-based FPGA systems or set-top boxes. However,
                 having the source of the update be remote from the FPGA
                 system opens the door to a set of attacks that may
                 challenge the confidentiality and integrity of the FPGA
                 configuration, the bitstream. Existing schemes propose
                 to encrypt and authenticate the bitstream to thwart
                 these attacks. However, we show that they do not
                 prevent the replay of old bitstream versions, and thus
                 give adversaries an opportunity for downgrading the
                 system. In this article, we propose a new architecture
                 called SARFUM that, in addition to ensuring bitstream
                 confidentiality and integrity, precludes the replay of
                 old bitstreams. SARFUM also includes a protocol for the
                 system designer to remotely monitor the running
                 configuration of the FPGA. Following our presentation
                 and analysis of the security protocols, we propose an
                 example of implementation with the CCM (Counter with
                 CBC-MAC) authenticated encryption standard. We also
                 evaluate the impact of our architecture on the
                 configuration time for different FPGA devices.",
  acknowledgement = ack-nhfb,
  articleno =    "8",
  fjournal =     "ACM Transactions on Reconfigurable Technology and
                 Systems (TRETS)",
  journal-URL =  "http://portal.acm.org/toc.cfm?id=J1151",
  keywords =     "authenticated encryption; bitstream security; FPGA;
                 replay attack; security protocol; system downgrade",
}

@article{Yoo:2010:IRR,
  author          = {Sang-Kyung Yoo and Deniz Karakoyunlu and Berk Birand and Berk Sunar},
  title           = {Improving the Robustness of Ring Oscillator {TRNGs}},
  journal         = j-TRETS,
  fjournal        = {ACM Transactions on Reconfigurable Technology and Systems (TRETS)},
  year            = {2010},
  month           = may,
  volume          = {3},
  number          = {2},
  articleno       = {9},
  pages           = {9:1--9:??},
  doi             = {http://doi.acm.org/10.1145/1754386.1754390},
  issn            = {1936-7406 (print), 1936-7414 (electronic)},
  issn-l          = {1936-7406},
  coden           = {????},
  journal-url     = {http://portal.acm.org/toc.cfm?id=J1151},
  keywords        = {cryptography; Oscillator rings; true random number generators},
  abstract        = {A ring oscillator-based true-random number generator
    design (Rings design) was introduced in Sunar et al.
    [2007]. The design was rigorously analyzed under a
    simple mathematical model and its performance
    characteristics were established. In this article we
    focus on the practical aspects of the Rings design on a
    reconfigurable logic platform and determine their
    implications on the earlier analysis framework. We make
    recommendations for avoiding pitfalls in real-life
    implementations by considering ring interaction,
    transistor-level effects, narrow signal rejection,
    transmission line attenuation, and sampler bias.
    Furthermore, we present experimental results showing
    that changing operating conditions such as the power
    supply voltage or the operating temperature may affect
    the output quality when the signal is subsampled.
    Hence, an attacker may shift the operating point via a
    simple noninvasive influence and easily bias the TRNG
    output. Finally, we propose modifications to the design
    which significantly improve its robustness against
    attacks, alleviate implementation-related problems, and
    simultaneously improve its area, throughput, and power
    performance.},
  bibsource       = {http://portal.acm.org/; http://www.math.utah.edu/pub/tex/bib/trets.bib},
  bibdate         = {Tue Jun 22 16:00:33 MDT 2010},
  acknowledgement = ack-nhfb,
}

@article{Huffmire:2010:SPR,
  author          = {Ted Huffmire and Timothy Levin and Thuy Nguyen and Cynthia Irvine and Brett Brotherton and Gang Wang and Timothy Sherwood and Ryan Kastner},
  title           = {Security Primitives for Reconfigurable Hardware-Based Systems},
  journal         = j-TRETS,
  fjournal        = {ACM Transactions on Reconfigurable Technology and Systems (TRETS)},
  year            = {2010},
  month           = may,
  volume          = {3},
  number          = {2},
  articleno       = {10},
  pages           = {10:1--10:??},
  doi             = {http://doi.acm.org/10.1145/1754386.1754391},
  issn            = {1936-7406 (print), 1936-7414 (electronic)},
  issn-l          = {1936-7406},
  coden           = {????},
  journal-url     = {http://portal.acm.org/toc.cfm?id=J1151},
  keywords        = {Advanced Encryption Standard (AES); controlled
    sharing; enforcement mechanisms; execution monitors;
    Field Programmable Gate Arrays (FPGAs); hardware
    security; isolation; memory protection; reference
    monitors; security policies; security primitives;
    separation; static analysis; Systems-on-a-Chip (SoCs)},
  abstract        = {Computing systems designed using reconfigurable
    hardware are increasingly composed using a number of
    different Intellectual Property (IP) cores, which are
    often provided by third-party vendors that may have
    different levels of trust. Unlike traditional software
    where hardware resources are mediated using an
    operating system, IP cores have fine-grain control over
    the underlying reconfigurable hardware. To address this
    problem, the embedded systems community requires novel
    security primitives that address the realities of
    modern reconfigurable hardware. In this work, we
    propose security primitives using ideas centered around
    the notion of ``moats and drawbridges.'' The primitives
    encompass four design properties: logical isolation,
    interconnect traceability, secure reconfigurable
    broadcast, and configuration scrubbing. Each of these
    is a fundamental operation with easily understood
    formal properties, yet they map cleanly and efficiently
    to a wide variety of reconfigurable devices. We
    carefully quantify the required overheads of the
    security techniques on modern FPGA architectures across
    a number of different applications.},
  bibsource       = {http://portal.acm.org/; http://www.math.utah.edu/pub/tex/bib/trets.bib},
  bibdate         = {Tue Jun 22 16:00:33 MDT 2010},
  acknowledgement = ack-nhfb,
}

@article{Hemmert:2010:FEF,
  author          = {K. Scott Hemmert and Keith D. Underwood},
  title           = {Fast, Efficient Floating-Point Adders and Multipliers for {FPGAs}},
  journal         = j-TRETS,
  fjournal        = {ACM Transactions on Reconfigurable Technology and Systems (TRETS)},
  year            = {2010},
  month           = sep,
  volume          = {3},
  number          = {3},
  articleno       = {11},
  pages           = {11:1--11:??},
  doi             = {http://doi.acm.org/10.1145/1839480.1839481},
  issn            = {1936-7406 (print), 1936-7414 (electronic)},
  issn-l          = {1936-7406},
  coden           = {????},
  journal-url     = {http://portal.acm.org/toc.cfm?id=J1151},
  keywords        = {floating point; FPGA; HPC; reconfigurable computing},
  abstract        = {Floating-point applications are a growing trend in the
    FPGA community. As such, it has become critical to
    create floating-point units optimized for standard FPGA
    technology. Unfortunately, the FPGA design space is
    very different from the VLSI design space; thus,
    optimizations for FPGAs can differ significantly from
    optimizations for VLSI. In particular, the FPGA
    environment constrains the design space such that only
    limited parallelism can be effectively exploited to
    reduce latency. Obtaining the right balances between
    clock speed, latency, and area in FPGAs can be
    particularly challenging. This article presents
    implementation details for an IEEE-754 standard
    floating-point adder and multiplier for FPGAs. The
    designs presented here enable a Xilinx Virtex4 FPGA
    (-11 speed grade) to achieve 270 MHz IEEE compliant
    double precision floating-point performance with a
    9-stage adder pipeline and 14-stage multiplier
    pipeline. The area requirement is approximately 500
    slices for the adder and under 750 slices for the
    multiplier.},
  bibsource       = {http://portal.acm.org/; http://www.math.utah.edu/pub/tex/bib/trets.bib},
  bibdate         = {Fri Oct 8 18:26:34 MDT 2010},
  acknowledgement = ack-nhfb,
}

@article{Sghaier:2010:IAT,
  author          = {Ahmad Sghaier and Shawki Areibi and Robert Dony},
  title           = {Implementation Approaches Trade-Offs for {WiMax OFDM} Functions on Reconfigurable Platforms},
  journal         = j-TRETS,
  fjournal        = {ACM Transactions on Reconfigurable Technology and Systems (TRETS)},
  year            = {2010},
  month           = sep,
  volume          = {3},
  number          = {3},
  articleno       = {12},
  pages           = {12:1--12:??},
  doi             = {http://doi.acm.org/10.1145/1839480.1839482},
  issn            = {1936-7406 (print), 1936-7414 (electronic)},
  issn-l          = {1936-7406},
  coden           = {????},
  journal-url     = {http://portal.acm.org/toc.cfm?id=J1151},
  keywords        = {AccelDSP; ASIP; custom RTL; FPGA; Tensilica; WiMax},
  abstract        = {This work investigates several approaches for
    implementing the OFDM functions of the fixed-WiMax
    standard on reconfigurable platforms. In the first
    phase, a custom RTL approach, using VHDL, is
    investigated. The approach shows the capability of a
    medium-size FPGA to accommodate the OFDM functions of a
    fixed-WiMax transceiver with only 50\% occupation rate.
    In the second phase, a high-level approach based on the
    AccelDSP tool is used and compared to the custom RTL
    approach. The approach presents an easy flow to
    transfer MATLAB floating-point code into synthesizable
    cores. The AccelDSP approach shows an area overhead of
    10\%, while allowing early architectural exploration
    and accelerating the design time by a factor of two.
    However, the performance figure obtained is almost 1/4
    of that obtained in the custom RTL approach. In the
    third phase, the Tensilica Xtensa configurable
    processor is targeted, which presents remarkable
    figures in terms of power, area, and design time.
    Comparing the three approaches indicates that the
    custom RTL approach has the lead in terms of
    performance. However, both the AccelDSP and the
    Tensilica Xtensa approaches show fast design time and
    early architectural exploration capability. In terms of
    power, the obtained estimation results show that the
    configurable Xtensa processor approach has the lead,
    where approximately the total power consumed is about
    12--15 times less than those results obtained by the
    other two approaches.},
  bibsource       = {http://portal.acm.org/; http://www.math.utah.edu/pub/tex/bib/trets.bib},
  bibdate         = {Fri Oct 8 18:26:34 MDT 2010},
  acknowledgement = ack-nhfb,
}

@article{Smith:2010:AFA,
  author          = {Alastair M. Smith and George A. Constantinides and Peter Y. K. Cheung},
  title           = {An Automated Flow for Arithmetic Component Generation in Field-Programmable Gate Arrays},
  journal         = j-TRETS,
  fjournal        = {ACM Transactions on Reconfigurable Technology and Systems (TRETS)},
  year            = {2010},
  month           = sep,
  volume          = {3},
  number          = {3},
  articleno       = {13},
  pages           = {13:1--13:??},
  doi             = {http://doi.acm.org/10.1145/1839480.1839483},
  issn            = {1936-7406 (print), 1936-7414 (electronic)},
  issn-l          = {1936-7406},
  coden           = {????},
  journal-url     = {http://portal.acm.org/toc.cfm?id=J1151},
  keywords        = {common subgraph; FPGA; reconfigurable logic},
  abstract        = {State-of-the-art configurable logic platforms, such as
    Field-Programmable Gate Arrays (FPGAs), consist of a
    heterogeneous mixture of different component types.
    Compared to traditional homogeneous configurable
    platforms, heterogeneity provides speed and density
    advantages. This is due to the replacement of
    inefficient programmable logic and routing with
    specialized logic and fixed interconnect in components
    such as memories, embedded processor units, and fused
    arithmetic units. Given the increasing complexity of
    these components, this article introduces a method to
    automatically propose and explore the benefits of
    different types of fused arithmetic units. The methods
    are based on common subgraph extraction techniques,
    meaning that it is possible to explore different
    subcircuits that occur frequently across a set of
    benchmarks. A quantitative analysis is performed of the
    various fused arithmetic circuits identified by our
    tool, which are then automatically synthesized to an
    ASIC process, providing a study of the speed and area
    benefits of the components. The results of this study
    provide bounds on the performance of heterogeneous
    FPGAs: by incorporating coarse-grain components which
    match the specific needs of a set of benchmarks we show
    that significant improvements in circuit speed and area
    can be made.},
  bibsource       = {http://portal.acm.org/; http://www.math.utah.edu/pub/tex/bib/trets.bib},
  bibdate         = {Fri Oct 8 18:26:34 MDT 2010},
  acknowledgement = ack-nhfb,
}

@Article{Moscola:2010:HAR,
  author =       "James Moscola and Ron K. Cytron and Young H. Cho",
  title =        "Hardware-Accelerated {RNA} Secondary-Structure
                 Alignment",
  journal =      j-TRETS,
  volume =       "3",
  number =       "3",
  pages =        "14:1--14:??",
  month =        sep,
  year =         "2010",
  CODEN =        "????",
  DOI =          "http://dx.doi.org/10.1145/1839480.1839484",
  ISSN =         "1936-7406 (print), 1936-7414 (electronic)",
  ISSN-L =       "1936-7406",
  bibdate =      "Fri Oct 8 18:26:34 MDT 2010",
  bibsource =    "http://portal.acm.org/;
                 http://www.math.utah.edu/pub/tex/bib/trets.bib",
  abstract =     "The search for homologous RNA molecules---sequences of
                 RNA that might behave similarly due to similarity in
                 their physical (secondary) structure---is currently a
                 computationally intensive task. Moreover, RNA sequences
                 are populating genome databases at a pace unmatched by
                 gains in standard processor performance. While software
                 tools such as Infernal can efficiently find homologies
                 among RNA families and genome databases of modest size,
                 the continuous advent of new RNA families and the
                 explosive growth in volume of RNA sequences necessitate
                 a faster approach.\par

                 This work introduces two different architectures for
                 accelerating the task of finding homologous RNA
                 molecules in a genome database. The first architecture
                 takes advantage of the tree-like configuration of the
                 covariance models used to represent the consensus
                 secondary structure of an RNA family and converts it
                 directly into a highly-pipelined processing engine.
                 Results for this architecture show a 24$ \times $
                 speedup over Infernal when processing a small RNA
                 model. It is estimated that the architecture could
                 potentially offer several thousands of times speedup
                 over Infernal on larger models, provided that there are
                 sufficient hardware resources available.\par

                 The second architecture is introduced to address the
                 steep resource requirements of the first architecture.
                 It utilizes a uniform array of processing elements and
                 schedules all of the computations required to scan for
                 an RNA homolog onto those processing elements. The
                 estimated speedup for this architecture over the
                 Infernal software package ranges from just under 20$
                 \times $ to over 2,350$ \times $.",
  acknowledgement = ack-nhfb,
  articleno =    "14",
  fjournal =     "ACM Transactions on Reconfigurable Technology and
                 Systems (TRETS)",
  journal-URL =  "http://portal.acm.org/toc.cfm?id=J1151",
  keywords =     "Bioinformatics; RNA; secondary-structure alignment",
}

@Article{Ben-Asher:2010:RMC,
  author =       "Yosi Ben-Asher and Danny Meisler and Nadav Rotem",
  title =        "Reducing Memory Constraints in Modulo Scheduling
                 Synthesis for {FPGAs}",
  journal =      j-TRETS,
  volume =       "3",
  number =       "3",
  pages =        "15:1--15:??",
  month =        sep,
  year =         "2010",
  CODEN =        "????",
  DOI =          "http://dx.doi.org/10.1145/1839480.1839485",
  ISSN =         "1936-7406 (print), 1936-7414 (electronic)",
  ISSN-L =       "1936-7406",
  bibdate =      "Fri Oct 8 18:26:34 MDT 2010",
  bibsource =    "http://portal.acm.org/;
                 http://www.math.utah.edu/pub/tex/bib/trets.bib",
  abstract =     "In High-Level Synthesis (HLS), extracting parallelism
                 in order to create small and fast circuits is the main
                 advantage of HLS over software execution. Modulo
                 Scheduling (MS) is a technique in which a loop is
                 parallelized by overlapping different parts of
                 successive iterations. This ability to extract
                 parallelism makes MS an attractive synthesis technique
                 for loop acceleration. In this work we consider two
                 problems involved in the use of MS which are central
                 when targeting FPGAs. Current MS scheduling techniques
                 sacrifice execution times in order to meet resource and
                 delay constraints. Let ``ideal'' execution times be the
                 ones that could have been obtained by MS had we ignored
                 resource and delay constraints. Here we pose the
                 opposite problem, which is more suitable for HLS,
                 namely, how to reduce resource constraints without
                 sacrificing the ideal execution time. We focus on
                 reducing the number of memory ports used by the MS
                 synthesis, which we believe is a crucial resource for
                 HLS. In addition to reducing the number of memory ports
                 we consider the need to develop MS techniques that are
                 fast enough to allow interactive synthesis times and
                 repeated applications of the MS to explore different
                 possibilities of synthesizing the circuits. Current
                 solutions for MS synthesis that can handle memory
                 constraints are too slow to support interactive
                 synthesis. We formalize the problem of reducing the
                 number of parallel memory references in every row of
                 the kernel by a novel combinatorial setting. The
                 proposed technique is based on inserting dummy
                 operations in the kernel and by doing so, performing
                 modulo-shift operations such that the maximal number of
                 parallel memory references in a row is reduced.
                 Experimental results suggest improved execution times
                 for the synthesized circuit. The synthesis takes only a
                 few seconds even for large-size loops.",
  acknowledgement = ack-nhfb,
  articleno =    "15",
  fjournal =     "ACM Transactions on Reconfigurable Technology and
                 Systems (TRETS)",
  journal-URL =  "http://portal.acm.org/toc.cfm?id=J1151",
  keywords =     "FPGA; high-level synthesis; memory optimizations;
                 modulo-scheduling",
}

@Article{Wang:2010:VVP,
  author =       "Xiaojun Wang and Miriam Leeser",
  title =        "{VFloat}: a Variable Precision Fixed- and
                 Floating-Point Library for Reconfigurable Hardware",
  journal =      j-TRETS,
  volume =       "3",
  number =       "3",
  pages =        "16:1--16:??",
  month =        sep,
  year =         "2010",
  CODEN =        "????",
  DOI =          "http://dx.doi.org/10.1145/1839480.1839486",
  ISSN =         "1936-7406 (print), 1936-7414 (electronic)",
  ISSN-L =       "1936-7406",
  bibdate =      "Fri Oct 8 18:26:34 MDT 2010",
  bibsource =    "http://portal.acm.org/;
                 http://www.math.utah.edu/pub/tex/bib/fparith.bib;
                 http://www.math.utah.edu/pub/tex/bib/trets.bib",
  abstract =     "Optimal reconfigurable hardware implementations may
                 require the use of arbitrary floating-point formats
                 that do not necessarily conform to IEEE specified
                 sizes. We present a variable precision floating-point
                 library (VFloat) that supports general floating-point
                 formats including IEEE standard formats. Most
                 previously published floating-point formats for use
                 with reconfigurable hardware are subsets of our format.
                 Custom datapaths with optimal bitwidths for each
                 operation can be built using the variable precision
                 hardware modules in the VFloat library, enabling a
                 higher level of parallelism. The VFloat library
                 includes three types of hardware modules for format
                 control, arithmetic operations, and conversions between
                 fixed-point and floating-point formats. The format
                 conversions allow for hybrid fixed- and floating-point
                 operations in a single design. This gives the designer
                 control over a large number of design possibilities
                 including format as well as number range within the
                 same application. In this article, we give an overview
                 of the components in the VFloat library and demonstrate
                 their use in an implementation of the K-means
                 clustering algorithm applied to multispectral satellite
                 images.",
  acknowledgement = ack-nhfb,
  articleno =    "16",
  fjournal =     "ACM Transactions on Reconfigurable Technology and
                 Systems (TRETS)",
  journal-URL =  "http://portal.acm.org/toc.cfm?id=J1151",
  keywords =     "clustering; floating-point; Reconfigurable hardware",
}

@Article{Purnaprajna:2010:RRM,
  author =       "Madhura Purnaprajna and Mario Porrmann and Ulrich
                 Rueckert and Michael Hussmann and Michael Thies and Uwe
                 Kastens",
  title =        "Runtime Reconfiguration of Multiprocessors Based on
                 Compile-Time Analysis",
  journal =      j-TRETS,
  volume =       "3",
  number =       "3",
  pages =        "17:1--17:??",
  month =        sep,
  year =         "2010",
  CODEN =        "????",
  DOI =          "http://dx.doi.org/10.1145/1839480.1839487",
  ISSN =         "1936-7406 (print), 1936-7414 (electronic)",
  ISSN-L =       "1936-7406",
  bibdate =      "Fri Oct 8 18:26:34 MDT 2010",
  bibsource =    "http://portal.acm.org/;
                 http://www.math.utah.edu/pub/tex/bib/trets.bib",
  abstract =     "In multiprocessors, performance improvement is
                 typically achieved by exploring parallelism with fixed
                 granularities, such as instruction-level, task-level,
                 or data-level parallelism. We introduce a new
                 reconfiguration mechanism that facilitates variations
                 in these granularities in order to optimize resource
                 utilization in addition to performance improvements.
                 Our reconfigurable multiprocessor QuadroCore combines
                 the advantages of reconfigurability and parallel
                 processing. In this article, a unified
                 hardware-software approach for the design of our
                 QuadroCore is presented. This design flow is enabled
                 via compiler-driven reconfiguration which matches
                 application-specific characteristics to a fixed set of
                 architectural variations. A special reconfiguration
                 mechanism has been developed that alters the
                 architecture within a single clock cycle.\par

                 The QuadroCore has been implemented on Xilinx XC2V6000
                 for functional validation and on UMC's 90nm standard
                 cell technology for performance estimation. A diverse
                 set of applications have been mapped onto the
                 reconfigurable multiprocessor to meet orthogonal
                 performance characteristics in terms of time and power.
                 Speedup measurements show a 2--11 times performance
                 increase in comparison to a single processor.
                 Additionally, the reconfiguration scheme has been
                 applied to save power in data-parallel applications.
                 Gate-level simulations have been performed to measure
                 the power-performance trade-offs for two
                 computationally complex applications. The power reports
                 confirm that introducing this scheme of reconfiguration
                 results in power savings in the range of 15--24\%.",
  acknowledgement = ack-nhfb,
  articleno =    "17",
  fjournal =     "ACM Transactions on Reconfigurable Technology and
                 Systems (TRETS)",
  journal-URL =  "http://portal.acm.org/toc.cfm?id=J1151",
  keywords =     "compilation for multiprocessors; Reconfigurable
                 multiprocessors",
}

@Article{Banerjee:2010:BMA,
  author =       "Sudarshan Banerjee and Elaheh Bozorgzadeh and Juanjo
                 Noguera and Nikil Dutt",
  title =        "Bandwidth Management in Application Mapping for
                 Dynamically Reconfigurable Architectures",
  journal =      j-TRETS,
  volume =       "3",
  number =       "3",
  pages =        "18:1--18:??",
  month =        sep,
  year =         "2010",
  CODEN =        "????",
  DOI =          "http://dx.doi.org/10.1145/1839480.1839488",
  ISSN =         "1936-7406 (print), 1936-7414 (electronic)",
  ISSN-L =       "1936-7406",
  bibdate =      "Fri Oct 8 18:26:34 MDT 2010",
  bibsource =    "http://portal.acm.org/;
                 http://www.math.utah.edu/pub/tex/bib/trets.bib",
  abstract =     "Partial dynamic reconfiguration (often referred to as
                 partial RTR) enables true on-demand computing. In an
                 on-demand computing environment, a dynamically invoked
                 application is assigned resources such as data
                 bandwidth, configurable logic. The limited logic
                 resources are customized during application execution
                 by exploiting partial RTR. In this article, we propose
                 an approach that maximizes application performance when
                 available bandwidth and logic resources are limited.
                 Our proposed approach is based on theoretical
                 principles of minimizing application schedule length
                 under bandwidth and logic resource constraints. It
                 includes detailed microarchitectural considerations on
                 a commercially popular reconfigurable device, and it
                 exploits partial RTR very effectively by utilizing
                 data-parallelism property of common image-processing
                 applications. We present extensive application case
                 studies on a cycle-accurate simulation platform that
                 includes detailed resource considerations of the Xilinx
                 Virtex XC2V3000. Our experimental results demonstrate
                 that applying our proposed approach to common
                 image-filtering applications leads to 15--20\%
                 performance gain in scenarios with limited bandwidth,
                 when compared to prior work that also exploits
                 data-parallelism with RTR but includes simpler
                 bandwidth considerations. Last but not the least, we
                 also demonstrate how our proposed theoretical
                 principles can be directly applied to solve related
                 problems such as minimizing schedule length under logic
                 resource and power constraints.",
  acknowledgement = ack-nhfb,
  articleno =    "18",
  fjournal =     "ACM Transactions on Reconfigurable Technology and
                 Systems (TRETS)",
  journal-URL =  "http://portal.acm.org/toc.cfm?id=J1151",
  keywords =     "bandwidth; Partial RTR; scheduling",
}

@Article{Williams:2010:CFR,
  author =       {Williams, Jason and Massie, Chris and George, Alan D.
                 and Richardson, Justin and Gosrani, Kunal and Lam,
                 Herman},
  title =        {Characterization of Fixed and Reconfigurable
                 Multi-Core Devices for Application Acceleration},
  journal =      j-TRETS,
  volume =       {3},
  number =       {4},
  pages =        {19:1--19:??},
  month =        nov,
  year =         {2010},
  CODEN =        {????},
  DOI =          {http://dx.doi.org/10.1145/1862648.1862649},
  ISSN =         {1936-7406 (print), 1936-7414 (electronic)},
  ISSN-L =       {1936-7406},
  bibdate =      {Tue Nov 23 11:26:33 MST 2010},
  bibsource =    {http://portal.acm.org/;
                 http://www.math.utah.edu/pub/tex/bib/trets.bib},
  acknowledgement = ack-nhfb,
  articleno =    {19},
  fjournal =     {ACM Transactions on Reconfigurable Technology and
                 Systems},
  journal-URL =  {http://portal.acm.org/toc.cfm?id=J1151},
}

@Article{Huang:2010:RCA,
  author =       {Huang, Miaoqing and Narayana, Vikram K. and Simmler,
                 Harald and Serres, Olivier and El-Ghazawi, Tarek},
  title =        {Reconfiguration and Communication-Aware Task
                 Scheduling for High-Performance Reconfigurable
                 Computing},
  journal =      j-TRETS,
  volume =       {3},
  number =       {4},
  pages =        {20:1--20:??},
  month =        nov,
  year =         {2010},
  CODEN =        {????},
  DOI =          {http://dx.doi.org/10.1145/1862648.1862650},
  ISSN =         {1936-7406 (print), 1936-7414 (electronic)},
  ISSN-L =       {1936-7406},
  bibdate =      {Tue Nov 23 11:26:33 MST 2010},
  bibsource =    {http://portal.acm.org/;
                 http://www.math.utah.edu/pub/tex/bib/trets.bib},
  acknowledgement = ack-nhfb,
  articleno =    {20},
  fjournal =     {ACM Transactions on Reconfigurable Technology and
                 Systems},
  journal-URL =  {http://portal.acm.org/toc.cfm?id=J1151},
}

@Article{Sano:2010:FAB,
  author =       "Kentaro Sano and Luzhou Wang and Yoshiaki Hatsuda and
                 Takanori Iizuka and Satoru Yamamoto",
  title =        "{FPGA}-Array with Bandwidth-Reduction Mechanism for
                 Scalable and Power-Efficient Numerical Simulations
                 Based on Finite Difference Methods",
  journal =      j-TRETS,
  volume =       "3",
  number =       "4",
  pages =        "21:1--21:??",
  month =        nov,
  year =         "2010",
  CODEN =        "????",
  DOI =          "http://dx.doi.org/10.1145/1862648.1862651",
  ISSN =         "1936-7406 (print), 1936-7414 (electronic)",
  ISSN-L =       "1936-7406",
  bibdate =      "Tue Nov 23 11:26:33 MST 2010",
  bibsource =    "http://portal.acm.org/;
                 http://www.math.utah.edu/pub/tex/bib/trets.bib",
  acknowledgement = ack-nhfb,
  articleno =    "21",
  fjournal =     "ACM Transactions on Reconfigurable Technology and
                 Systems",
  journal-URL =  "http://portal.acm.org/toc.cfm?id=J1151",
}

@Article{Saldana:2010:MPM,
  author =       {Salda{\~n}a, Manuel and Patel, Arun and Madill,
                 Christopher and Nunes, Daniel and Wang, Danyao and
                 Chow, Paul and Wittig, Ralph and Styles, Henry and
                 Putnam, Andrew},
  title =        {{MPI} as a Programming Model for High-Performance
                 Reconfigurable Computers},
  journal =      j-TRETS,
  volume =       {3},
  number =       {4},
  pages =        {22:1--22:??},
  month =        nov,
  year =         {2010},
  CODEN =        {????},
  DOI =          {http://dx.doi.org/10.1145/1862648.1862652},
  ISSN =         {1936-7406 (print), 1936-7414 (electronic)},
  ISSN-L =       {1936-7406},
  bibdate =      {Tue Nov 23 11:26:33 MST 2010},
  bibsource =    {http://portal.acm.org/;
                 http://www.math.utah.edu/pub/tex/bib/trets.bib},
  acknowledgement = ack-nhfb,
  articleno =    {22},
  fjournal =     {ACM Transactions on Reconfigurable Technology and
                 Systems},
  journal-URL =  {http://portal.acm.org/toc.cfm?id=J1151},
}

@Article{Chiu:2010:MDS,
  author =       {Chiu, Matt and Herbordt, Martin C.},
  title =        {Molecular Dynamics Simulations on High-Performance
                 Reconfigurable Computing Systems},
  journal =      j-TRETS,
  volume =       {3},
  number =       {4},
  pages =        {23:1--23:??},
  month =        nov,
  year =         {2010},
  CODEN =        {????},
  DOI =          {http://dx.doi.org/10.1145/1862648.1862653},
  ISSN =         {1936-7406 (print), 1936-7414 (electronic)},
  ISSN-L =       {1936-7406},
  bibdate =      {Tue Nov 23 11:26:33 MST 2010},
  bibsource =    {http://portal.acm.org/;
                 http://www.math.utah.edu/pub/tex/bib/trets.bib},
  acknowledgement = ack-nhfb,
  articleno =    {23},
  fjournal =     {ACM Transactions on Reconfigurable Technology and
                 Systems},
  journal-URL =  {http://portal.acm.org/toc.cfm?id=J1151},
}

@Article{Montone:2010:PFD,
  author =       {Montone, Alessio and Santambrogio, Marco D. and
                 Sciuto, Donatella and Memik, Seda Ogrenci},
  title =        {Placement and Floorplanning in Dynamically
                 Reconfigurable {FPGAs}},
  journal =      j-TRETS,
  volume =       {3},
  number =       {4},
  pages =        {24:1--24:??},
  month =        nov,
  year =         {2010},
  CODEN =        {????},
  DOI =          {http://dx.doi.org/10.1145/1862648.1862654},
  ISSN =         {1936-7406 (print), 1936-7414 (electronic)},
  ISSN-L =       {1936-7406},
  bibdate =      {Tue Nov 23 11:26:33 MST 2010},
  bibsource =    {http://portal.acm.org/;
                 http://www.math.utah.edu/pub/tex/bib/trets.bib},
  acknowledgement = ack-nhfb,
  articleno =    {24},
  fjournal =     {ACM Transactions on Reconfigurable Technology and
                 Systems},
  journal-URL =  {http://portal.acm.org/toc.cfm?id=J1151},
}

@Article{Reardon:2010:SFR,
  author =       {Reardon, Casey and Grobelny, Eric and George, Alan D.
                 and Wang, Gongyu},
  title =        {A Simulation Framework for Rapid Analysis of
                 Reconfigurable Computing Systems},
  journal =      j-TRETS,
  volume =       {3},
  number =       {4},
  pages =        {25:1--25:??},
  month =        nov,
  year =         {2010},
  CODEN =        {????},
  DOI =          {http://dx.doi.org/10.1145/1862648.1862655},
  ISSN =         {1936-7406 (print), 1936-7414 (electronic)},
  ISSN-L =       {1936-7406},
  bibdate =      {Tue Nov 23 11:26:33 MST 2010},
  bibsource =    {http://portal.acm.org/;
                 http://www.math.utah.edu/pub/tex/bib/trets.bib},
  acknowledgement = ack-nhfb,
  articleno =    {25},
  fjournal =     {ACM Transactions on Reconfigurable Technology and
                 Systems},
  journal-URL =  {http://portal.acm.org/toc.cfm?id=J1151},
}

@Article{Tian:2010:HPQ,
  author =       "Xiang Tian and Khaled Benkrid",
  title =        "High-Performance Quasi-{Monte Carlo} Financial
                 Simulation: {FPGA} vs.\ {GPP} vs.\ {GPU}",
  journal =      j-TRETS,
  volume =       "3",
  number =       "4",
  pages =        "26:1--26:??",
  month =        nov,
  year =         "2010",
  CODEN =        "????",
  DOI =          "http://dx.doi.org/10.1145/1862648.1862656",
  ISSN =         "1936-7406 (print), 1936-7414 (electronic)",
  ISSN-L =       "1936-7406",
  bibdate =      "Tue Nov 23 11:26:33 MST 2010",
  bibsource =    "http://portal.acm.org/;
                 http://www.math.utah.edu/pub/tex/bib/trets.bib",
  acknowledgement = ack-nhfb,
  articleno =    "26",
  fjournal =     "ACM Transactions on Reconfigurable Technology and
                 Systems",
  journal-URL =  "http://portal.acm.org/toc.cfm?id=J1151",
}

@Article{Woods:2010:GEA,
  author =       {Woods, Roger and Becker, J{\"u}rgen and Athanas, Peter
                 and Morgan, Fearghal},
  title =        {Guest Editorial {ARC 2009}},
  journal =      j-TRETS,
  volume =       {4},
  number =       {1},
  pages =        {1:1--1:??},
  month =        dec,
  year =         {2010},
  CODEN =        {????},
  DOI =          {http://dx.doi.org/10.1145/1857927.1857928},
  ISSN =         {1936-7406 (print), 1936-7414 (electronic)},
  ISSN-L =       {1936-7406},
  bibdate =      {Wed Jan 26 14:58:50 MST 2011},
  bibsource =    {http://portal.acm.org/;
                 http://www.math.utah.edu/pub/tex/bib/trets.bib},
  acknowledgement = ack-nhfb,
  articleno =    {1},
  fjournal =     {ACM Transactions on Reconfigurable Technology and
                 Systems (TRETS)},
  journal-URL =  {http://portal.acm.org/toc.cfm?id=J1151},
}

@Article{Saiprasert:2010:OHA,
  author =       {Saiprasert, Chalermpol and Bouganis, Christos-S. and
                 Constantinides, George A.},
  title =        {An Optimized Hardware Architecture of a Multivariate
                 {Gaussian} Random Number Generator},
  journal =      j-TRETS,
  volume =       {4},
  number =       {1},
  pages =        {2:1--2:??},
  month =        dec,
  year =         {2010},
  CODEN =        {????},
  DOI =          {http://dx.doi.org/10.1145/1857927.1857929},
  ISSN =         {1936-7406 (print), 1936-7414 (electronic)},
  ISSN-L =       {1936-7406},
  bibdate =      {Wed Jan 26 14:58:50 MST 2011},
  bibsource =    {http://portal.acm.org/;
                 http://www.math.utah.edu/pub/tex/bib/trets.bib},
  abstract =     {Monte Carlo simulation is one of the most widely used
                 techniques for computationally intensive simulations in
                 mathematical analysis and modeling. A multivariate
                 Gaussian random number generator is one of the main
                 building blocks of such a system. Field Programmable
                 Gate Arrays (FPGAs) are gaining increased popularity as
                 an alternative means to the traditional general purpose
                 processors targeting the acceleration of the
                 computationally expensive random number generator
                 block. This article presents a novel approach for
                 mapping a multivariate Gaussian random number generator
                 onto an FPGA by optimizing the computational path in
                 terms of hardware resource usage subject to an
                 acceptable error in the approximation of the
                 distribution of interest. The proposed approach is
                 based on the eigenvalue decomposition algorithm which
                 leads to a design with different precision requirements
                 in the computational paths. An analysis on the impact
                 of the error due to truncation/rounding operation along
                 the computational path is performed and an analytical
                 expression of the error inserted into the system is
                 presented. Based on the error analysis, three
                 algorithms that optimize the resource utilization and
                 at the same time minimize the error in the output of
                 the system are presented and compared. Experimental
                 results reveal that the hardware resource usage on an
                 FPGA as well as the error in the approximation of the
                 distribution of interest are significantly reduced by
                 the use of the optimization techniques introduced in
                 the proposed approach.},
  acknowledgement = ack-nhfb,
  articleno =    {2},
  fjournal =     {ACM Transactions on Reconfigurable Technology and
                 Systems (TRETS)},
  journal-URL =  {http://portal.acm.org/toc.cfm?id=J1151},
}

@Article{Kahoul:2010:EHA,
  author =       {Kahoul, Asma and Smith, Alastair M. and
                 Constantinides, George A. and Cheung, Peter Y. K.},
  title =        {Efficient Heterogeneous Architecture Floorplan
                 Optimization using Analytical Methods},
  journal =      j-TRETS,
  volume =       {4},
  number =       {1},
  pages =        {3:1--3:??},
  month =        dec,
  year =         {2010},
  CODEN =        {????},
  DOI =          {http://dx.doi.org/10.1145/1857927.1857930},
  ISSN =         {1936-7406 (print), 1936-7414 (electronic)},
  ISSN-L =       {1936-7406},
  bibdate =      {Wed Jan 26 14:58:50 MST 2011},
  bibsource =    {http://portal.acm.org/;
                 http://www.math.utah.edu/pub/tex/bib/trets.bib},
  acknowledgement = ack-nhfb,
  articleno =    {3},
  fjournal =     {ACM Transactions on Reconfigurable Technology and
                 Systems (TRETS)},
  journal-URL =  {http://portal.acm.org/toc.cfm?id=J1151},
}

@Article{Kepa:2010:DAS,
  author =       {Kepa, K. and Morgan, F. and Ko{\'s}ciuszkiewicz, K.
                 and Braun, L. and H{\"u}bner, M. and Becker, J.},
  title =        {Design Assurance Strategy and Toolset for Partially
                 Reconfigurable {FPGA} Systems},
  journal =      j-TRETS,
  volume =       {4},
  number =       {1},
  pages =        {4:1--4:??},
  month =        dec,
  year =         {2010},
  CODEN =        {????},
  DOI =          {http://dx.doi.org/10.1145/1857927.1857931},
  ISSN =         {1936-7406 (print), 1936-7414 (electronic)},
  ISSN-L =       {1936-7406},
  bibdate =      {Wed Jan 26 14:58:50 MST 2011},
  bibsource =    {http://portal.acm.org/;
                 http://www.math.utah.edu/pub/tex/bib/trets.bib},
  acknowledgement = ack-nhfb,
  articleno =    {4},
  fjournal =     {ACM Transactions on Reconfigurable Technology and
                 Systems (TRETS)},
  journal-URL =  {http://portal.acm.org/toc.cfm?id=J1151},
}

@Article{Inoue:2010:VGL,
  author =       {Inoue, Kazuki and Zhao, Qian and Okamoto, Yasuhiro and
                 Yosho, Hiroki and Amagasaki, Motoki and Iida, Masahiro
                 and Sueyoshi, Toshinori},
  title =        {A Variable-Grain Logic Cell and Routing Architecture
                 for a Reconfigurable {IP} Core},
  journal =      j-TRETS,
  volume =       {4},
  number =       {1},
  pages =        {5:1--5:??},
  month =        dec,
  year =         {2010},
  CODEN =        {????},
  DOI =          {http://dx.doi.org/10.1145/1857927.1857932},
  ISSN =         {1936-7406 (print), 1936-7414 (electronic)},
  ISSN-L =       {1936-7406},
  bibdate =      {Wed Jan 26 14:58:50 MST 2011},
  bibsource =    {http://portal.acm.org/;
                 http://www.math.utah.edu/pub/tex/bib/trets.bib},
  acknowledgement = ack-nhfb,
  articleno =    {5},
  fjournal =     {ACM Transactions on Reconfigurable Technology and
                 Systems (TRETS)},
  journal-URL =  {http://portal.acm.org/toc.cfm?id=J1151},
}

@Article{Guo:2010:OSC,
  author =       {Guo, Xu and Schaumont, Patrick},
  title =        {Optimized System-on-Chip Integration of a Programmable
                 {ECC} Coprocessor},
  journal =      j-TRETS,
  volume =       {4},
  number =       {1},
  pages =        {6:1--6:??},
  month =        dec,
  year =         {2010},
  CODEN =        {????},
  DOI =          {http://dx.doi.org/10.1145/1857927.1857933},
  ISSN =         {1936-7406 (print), 1936-7414 (electronic)},
  ISSN-L =       {1936-7406},
  bibdate =      {Wed Jan 26 14:58:50 MST 2011},
  bibsource =    {http://portal.acm.org/;
                 http://www.math.utah.edu/pub/tex/bib/trets.bib},
  acknowledgement = ack-nhfb,
  articleno =    {6},
  fjournal =     {ACM Transactions on Reconfigurable Technology and
                 Systems (TRETS)},
  journal-URL =  {http://portal.acm.org/toc.cfm?id=J1151},
}

@Article{Sterpone:2010:NTD,
  author =       {Sterpone, Luca},
  title =        {A New Timing Driven Placement Algorithm for Dependable
                 Circuits on {SRAM}-based {FPGAs}},
  journal =      j-TRETS,
  volume =       {4},
  number =       {1},
  pages =        {7:1--7:??},
  month =        dec,
  year =         {2010},
  CODEN =        {????},
  DOI =          {http://dx.doi.org/10.1145/1857927.1857934},
  ISSN =         {1936-7406 (print), 1936-7414 (electronic)},
  ISSN-L =       {1936-7406},
  bibdate =      {Wed Jan 26 14:58:50 MST 2011},
  bibsource =    {http://portal.acm.org/;
                 http://www.math.utah.edu/pub/tex/bib/trets.bib},
  acknowledgement = ack-nhfb,
  articleno =    {7},
  fjournal =     {ACM Transactions on Reconfigurable Technology and
                 Systems (TRETS)},
  journal-URL =  {http://portal.acm.org/toc.cfm?id=J1151},
}

@Article{Lanuzza:2010:ESR,
  author          = {M. Lanuzza and P. Zicari and F. Frustaci and
                     S. Perri and P. Corsonello},
  title           = {Exploiting Self-Reconfiguration Capability to
                     Improve {SRAM}-based {FPGA} Robustness in Space
                     and Avionics Applications},
  journal         = j-TRETS,
  year            = {2010},
  month           = dec,
  volume          = {4},
  number          = {1},
  pages           = {8:1--8:??},
  articleno       = {8},
  doi             = {http://dx.doi.org/10.1145/1857927.1857935},
  issn            = {1936-7406 (print), 1936-7414 (electronic)},
  issn-l          = {1936-7406},
  coden           = {????},
  fjournal        = {ACM Transactions on Reconfigurable Technology and
                     Systems (TRETS)},
  journal-url     = {http://portal.acm.org/toc.cfm?id=J1151},
  bibdate         = {Wed Jan 26 14:58:50 MST 2011},
  bibsource       = {http://portal.acm.org/;
                     http://www.math.utah.edu/pub/tex/bib/trets.bib},
  acknowledgement = ack-nhfb,
}

@Article{Hsiung:2010:SPH,
  author          = {Pao-Ann Hsiung and Chun-Hsian Huang and
                     Jih-Sheng Shen and Chen-Chi Chiang},
  title           = {Scheduling and Placement of Hardware\slash
                     Software Real-Time Relocatable Tasks in
                     Dynamically Partially Reconfigurable Systems},
  journal         = j-TRETS,
  year            = {2010},
  month           = dec,
  volume          = {4},
  number          = {1},
  pages           = {9:1--9:??},
  articleno       = {9},
  doi             = {http://dx.doi.org/10.1145/1857927.1857936},
  issn            = {1936-7406 (print), 1936-7414 (electronic)},
  issn-l          = {1936-7406},
  coden           = {????},
  fjournal        = {ACM Transactions on Reconfigurable Technology and
                     Systems (TRETS)},
  journal-url     = {http://portal.acm.org/toc.cfm?id=J1151},
  bibdate         = {Wed Jan 26 14:58:50 MST 2011},
  bibsource       = {http://portal.acm.org/;
                     http://www.math.utah.edu/pub/tex/bib/trets.bib},
  acknowledgement = ack-nhfb,
}

@Article{Kanazawa:2010:ASL,
  author          = {Kenji Kanazawa and Tsutomu Maruyama},
  title           = {An Approach for Solving Large {SAT} Problems on
                     {FPGA}},
  journal         = j-TRETS,
  year            = {2010},
  month           = dec,
  volume          = {4},
  number          = {1},
  pages           = {10:1--10:??},
  articleno       = {10},
  doi             = {http://dx.doi.org/10.1145/1857927.1857937},
  issn            = {1936-7406 (print), 1936-7414 (electronic)},
  issn-l          = {1936-7406},
  coden           = {????},
  fjournal        = {ACM Transactions on Reconfigurable Technology and
                     Systems (TRETS)},
  journal-url     = {http://portal.acm.org/toc.cfm?id=J1151},
  bibdate         = {Wed Jan 26 14:58:50 MST 2011},
  bibsource       = {http://portal.acm.org/;
                     http://www.math.utah.edu/pub/tex/bib/trets.bib},
  acknowledgement = ack-nhfb,
}

@Article{Lu:2010:ERD,
  author          = {Yingxi Lu and Maire O'Neill and John McCanny},
  title           = {Evaluation of Random Delay Insertion against
                     {DPA} on {FPGAs}},
  journal         = j-TRETS,
  year            = {2010},
  month           = dec,
  volume          = {4},
  number          = {1},
  pages           = {11:1--11:??},
  articleno       = {11},
  doi             = {http://dx.doi.org/10.1145/1857927.1857938},
  issn            = {1936-7406 (print), 1936-7414 (electronic)},
  issn-l          = {1936-7406},
  coden           = {????},
  fjournal        = {ACM Transactions on Reconfigurable Technology and
                     Systems (TRETS)},
  journal-url     = {http://portal.acm.org/toc.cfm?id=J1151},
  bibdate         = {Wed Jan 26 14:58:50 MST 2011},
  bibsource       = {http://portal.acm.org/;
                     http://www.math.utah.edu/pub/tex/bib/trets.bib},
  acknowledgement = ack-nhfb,
}

@Article{Bergeron:2011:LTF,
  author          = {Etienne Bergeron and Louis-David Perron and
                     Marc Feeley and Jean Pierre David},
  title           = {Logarithmic-Time {FPGA} Bitstream Analysis: a
                     Step Towards {JIT} Hardware Compilation},
  journal         = j-TRETS,
  year            = {2011},
  month           = may,
  volume          = {4},
  number          = {2},
  pages           = {12:1--12:??},
  articleno       = {12},
  doi             = {http://dx.doi.org/10.1145/1968502.1968503},
  issn            = {1936-7406 (print), 1936-7414 (electronic)},
  issn-l          = {1936-7406},
  coden           = {????},
  fjournal        = {ACM Transactions on Reconfigurable Technology and
                     Systems (TRETS)},
  journal-url     = {http://portal.acm.org/toc.cfm?id=J1151},
  bibdate         = {Tue Jun 7 18:34:54 MDT 2011},
  bibsource       = {http://portal.acm.org/;
                     http://www.math.utah.edu/pub/tex/bib/trets.bib},
  acknowledgement = ack-nhfb,
}

@Article{Vaidya:2011:NMC,
  author          = {Pranav Vaidya and Jaehwan John Lee},
  title           = {A Novel Multicontext Coarse-Grained
                     Reconfigurable Architecture {(CGRA)} For
                     Accelerating Column-Oriented Databases},
  journal         = j-TRETS,
  year            = {2011},
  month           = may,
  volume          = {4},
  number          = {2},
  pages           = {13:1--13:??},
  articleno       = {13},
  doi             = {http://dx.doi.org/10.1145/1968502.1968504},
  issn            = {1936-7406 (print), 1936-7414 (electronic)},
  issn-l          = {1936-7406},
  coden           = {????},
  fjournal        = {ACM Transactions on Reconfigurable Technology and
                     Systems (TRETS)},
  journal-url     = {http://portal.acm.org/toc.cfm?id=J1151},
  bibdate         = {Tue Jun 7 18:34:54 MDT 2011},
  bibsource       = {http://portal.acm.org/;
                     http://www.math.utah.edu/pub/tex/bib/trets.bib},
  acknowledgement = ack-nhfb,
}

@Article{ONeill:2011:SPM,
  author          = {Shane O'Neill and Roger Francis Woods and
                     Alan James Marshall and Qi Zhang},
  title           = {A Scalable and Programmable Modular Traffic
                     Manager Architecture},
  journal         = j-TRETS,
  year            = {2011},
  month           = may,
  volume          = {4},
  number          = {2},
  pages           = {14:1--14:??},
  articleno       = {14},
  doi             = {http://dx.doi.org/10.1145/1968502.1968505},
  issn            = {1936-7406 (print), 1936-7414 (electronic)},
  issn-l          = {1936-7406},
  coden           = {????},
  fjournal        = {ACM Transactions on Reconfigurable Technology and
                     Systems (TRETS)},
  journal-url     = {http://portal.acm.org/toc.cfm?id=J1151},
  bibdate         = {Tue Jun 7 18:34:54 MDT 2011},
  bibsource       = {http://portal.acm.org/;
                     http://www.math.utah.edu/pub/tex/bib/trets.bib},
  acknowledgement = ack-nhfb,
}

@Article{Nakajima:2011:FOR,
  author          = {Mao Nakajima and Minoru Watanabe},
  title           = {Fast Optical Reconfiguration of a Nine-Context
                     {DORGA} Using a Speed Adjustment Control},
  journal         = j-TRETS,
  year            = {2011},
  month           = may,
  volume          = {4},
  number          = {2},
  pages           = {15:1--15:??},
  articleno       = {15},
  doi             = {http://dx.doi.org/10.1145/1968502.1968506},
  issn            = {1936-7406 (print), 1936-7414 (electronic)},
  issn-l          = {1936-7406},
  coden           = {????},
  fjournal        = {ACM Transactions on Reconfigurable Technology and
                     Systems (TRETS)},
  journal-url     = {http://portal.acm.org/toc.cfm?id=J1151},
  bibdate         = {Tue Jun 7 18:34:54 MDT 2011},
  bibsource       = {http://portal.acm.org/;
                     http://www.math.utah.edu/pub/tex/bib/trets.bib},
  acknowledgement = ack-nhfb,
}

@Article{Tai:2011:POA,
  author          = {Tzu-Chiang Tai and Yen-Tai Lai},
  title           = {A Performance-Oriented Algorithm with
                     Consideration on Communication Cost for
                     Dynamically Reconfigurable {FPGA} Partitioning},
  journal         = j-TRETS,
  year            = {2011},
  month           = may,
  volume          = {4},
  number          = {2},
  pages           = {16:1--16:??},
  articleno       = {16},
  doi             = {http://dx.doi.org/10.1145/1968502.1968507},
  issn            = {1936-7406 (print), 1936-7414 (electronic)},
  issn-l          = {1936-7406},
  coden           = {????},
  fjournal        = {ACM Transactions on Reconfigurable Technology and
                     Systems (TRETS)},
  journal-url     = {http://portal.acm.org/toc.cfm?id=J1151},
  bibdate         = {Tue Jun 7 18:34:54 MDT 2011},
  bibsource       = {http://portal.acm.org/;
                     http://www.math.utah.edu/pub/tex/bib/trets.bib},
  acknowledgement = ack-nhfb,
}

@Article{Demertzi:2011:DSO,
  author          = {Melina Demertzi and Pedro C. Diniz and
                     Mary W. Hall and Anna C. Gilbert and Yi Wang},
  title           = {Domain-Specific Optimization of Signal
                     Recognition Targeting {FPGAs}},
  journal         = j-TRETS,
  year            = {2011},
  month           = may,
  volume          = {4},
  number          = {2},
  pages           = {17:1--17:??},
  articleno       = {17},
  doi             = {http://dx.doi.org/10.1145/1968502.1968508},
  issn            = {1936-7406 (print), 1936-7414 (electronic)},
  issn-l          = {1936-7406},
  coden           = {????},
  fjournal        = {ACM Transactions on Reconfigurable Technology and
                     Systems (TRETS)},
  journal-url     = {http://portal.acm.org/toc.cfm?id=J1151},
  bibdate         = {Tue Jun 7 18:34:54 MDT 2011},
  bibsource       = {http://portal.acm.org/;
                     http://www.math.utah.edu/pub/tex/bib/trets.bib},
  acknowledgement = ack-nhfb,
}

@Article{Galuzzi:2011:ISE,
  author          = {Carlo Galuzzi and Koen Bertels},
  title           = {The Instruction-Set Extension Problem: a Survey},
  journal         = j-TRETS,
  year            = {2011},
  month           = may,
  volume          = {4},
  number          = {2},
  pages           = {18:1--18:28},
  articleno       = {18},
  doi             = {http://dx.doi.org/10.1145/1968502.1968509},
  issn            = {1936-7406 (print), 1936-7414 (electronic)},
  issn-l          = {1936-7406},
  coden           = {????},
  fjournal        = {ACM Transactions on Reconfigurable Technology and
                     Systems (TRETS)},
  journal-url     = {http://portal.acm.org/toc.cfm?id=J1151},
  bibdate         = {Tue Jun 7 18:34:54 MDT 2011},
  bibsource       = {http://portal.acm.org/;
                     http://www.math.utah.edu/pub/tex/bib/trets.bib},
  acknowledgement = ack-nhfb,
}

@Article{Rupnow:2011:SAD,
  author          = {Kyle Rupnow and Keith D. Underwood and
                     Katherine Compton},
  title           = {Scientific Application Demands on a
                     Reconfigurable Functional Unit Interface},
  journal         = j-TRETS,
  year            = {2011},
  month           = may,
  volume          = {4},
  number          = {2},
  pages           = {19:1--19:??},
  articleno       = {19},
  doi             = {http://dx.doi.org/10.1145/1968502.1968510},
  issn            = {1936-7406 (print), 1936-7414 (electronic)},
  issn-l          = {1936-7406},
  coden           = {????},
  fjournal        = {ACM Transactions on Reconfigurable Technology and
                     Systems (TRETS)},
  journal-url     = {http://portal.acm.org/toc.cfm?id=J1151},
  bibdate         = {Tue Jun 7 18:34:54 MDT 2011},
  bibsource       = {http://portal.acm.org/;
                     http://www.math.utah.edu/pub/tex/bib/trets.bib},
  acknowledgement = ack-nhfb,
}

@Article{Kaganov:2011:FAM,
  author          = {Alexander Kaganov and Asif Lakhany and Paul Chow},
  title           = {{FPGA} Acceleration of {MultiFactor CDO} Pricing},
  journal         = j-TRETS,
  year            = {2011},
  month           = may,
  volume          = {4},
  number          = {2},
  pages           = {20:1--20:??},
  articleno       = {20},
  doi             = {http://dx.doi.org/10.1145/1968502.1968511},
  issn            = {1936-7406 (print), 1936-7414 (electronic)},
  issn-l          = {1936-7406},
  coden           = {????},
  fjournal        = {ACM Transactions on Reconfigurable Technology and
                     Systems (TRETS)},
  journal-url     = {http://portal.acm.org/toc.cfm?id=J1151},
  bibdate         = {Tue Jun 7 18:34:54 MDT 2011},
  bibsource       = {http://portal.acm.org/;
                     http://www.math.utah.edu/pub/tex/bib/trets.bib},
  acknowledgement = ack-nhfb,
}

@Article{Labrecque:2011:ASS,
  author =       "Martin Labrecque and Mark C. Jeffrey and J. Gregory
                 Steffan",
  title =        "Application-Specific Signatures for Transactional
                 Memory in Soft Processors",
  journal =      j-TRETS,
  volume =       "4",
  number =       "3",
  pages =        "21:1--21:??",
  month =        aug,
  year =         "2011",
  CODEN =        "????",
  DOI =          "http://dx.doi.org/10.1145/2000832.2000833",
  ISSN =         "1936-7406 (print), 1936-7414 (electronic)",
  ISSN-L =       "1936-7406",
  bibdate =      "Tue Aug 30 08:13:57 MDT 2011",
  bibsource =    "http://portal.acm.org/;
                 http://www.math.utah.edu/pub/tex/bib/trets.bib",
  acknowledgement = ack-nhfb,
  articleno =    "21",
  fjournal =     "ACM Transactions on Reconfigurable Technology and
                 Systems (TRETS)",
  journal-URL =  "http://portal.acm.org/toc.cfm?id=J1151",
}

@Article{Boland:2011:OMB,
  author =       "David Boland and George A. Constantinides",
  title =        "Optimizing Memory Bandwidth Use and Performance for
                 Matrix-Vector Multiplication in Iterative Methods",
  journal =      j-TRETS,
  volume =       "4",
  number =       "3",
  pages =        "22:1--22:??",
  month =        aug,
  year =         "2011",
  CODEN =        "????",
  DOI =          "http://dx.doi.org/10.1145/2000832.2000834",
  ISSN =         "1936-7406 (print), 1936-7414 (electronic)",
  ISSN-L =       "1936-7406",
  bibdate =      "Tue Aug 30 08:13:57 MDT 2011",
  bibsource =    "http://portal.acm.org/;
                 http://www.math.utah.edu/pub/tex/bib/trets.bib",
  acknowledgement = ack-nhfb,
  articleno =    "22",
  fjournal =     "ACM Transactions on Reconfigurable Technology and
                 Systems (TRETS)",
  journal-URL =  "http://portal.acm.org/toc.cfm?id=J1151",
}

@Article{Glaser:2011:TFT,
  author =       "Johann Glaser and Markus Damm and Jan Haase and
                 Christoph Grimm",
  title =        "{TR-FSM}: Transition-Based Reconfigurable Finite State
                 Machine",
  journal =      j-TRETS,
  volume =       "4",
  number =       "3",
  pages =        "23:1--23:??",
  month =        aug,
  year =         "2011",
  CODEN =        "????",
  DOI =          "http://dx.doi.org/10.1145/2000832.2000835",
  ISSN =         "1936-7406 (print), 1936-7414 (electronic)",
  ISSN-L =       "1936-7406",
  bibdate =      "Tue Aug 30 08:13:57 MDT 2011",
  bibsource =    "http://portal.acm.org/;
                 http://www.math.utah.edu/pub/tex/bib/trets.bib",
  acknowledgement = ack-nhfb,
  articleno =    "23",
  fjournal =     "ACM Transactions on Reconfigurable Technology and
                 Systems (TRETS)",
  journal-URL =  "http://portal.acm.org/toc.cfm?id=J1151",
}

@Article{Parvez:2011:ASF,
  author =       "Husain Parvez and Zied Marrakchi and Alp Kilic and
                 Habib Mehrez",
  title =        "Application-Specific {FPGA} Using Heterogeneous Logic
                 Blocks",
  journal =      j-TRETS,
  volume =       "4",
  number =       "3",
  pages =        "24:1--24:??",
  month =        aug,
  year =         "2011",
  CODEN =        "????",
  DOI =          "http://dx.doi.org/10.1145/2000832.2000836",
  ISSN =         "1936-7406 (print), 1936-7414 (electronic)",
  ISSN-L =       "1936-7406",
  bibdate =      "Tue Aug 30 08:13:57 MDT 2011",
  bibsource =    "http://portal.acm.org/;
                 http://www.math.utah.edu/pub/tex/bib/trets.bib",
  acknowledgement = ack-nhfb,
  articleno =    "24",
  fjournal =     "ACM Transactions on Reconfigurable Technology and
                 Systems (TRETS)",
  journal-URL =  "http://portal.acm.org/toc.cfm?id=J1151",
}

@Article{Yan:2011:FBA,
  author =       "Jing Yan and Ning-Yi Xu and Xiong-Fei Cai and Rui Gao
                 and Yu Wang and Rong Luo and Feng-Hsiung Hsu",
  title =        "An {FPGA}-based Accelerator for {LambdaRank} in {Web}
                 Search Engines",
  journal =      j-TRETS,
  volume =       "4",
  number =       "3",
  pages =        "25:1--25:??",
  month =        aug,
  year =         "2011",
  CODEN =        "????",
  DOI =          "http://dx.doi.org/10.1145/2000832.2000837",
  ISSN =         "1936-7406 (print), 1936-7414 (electronic)",
  ISSN-L =       "1936-7406",
  bibdate =      "Tue Aug 30 08:13:57 MDT 2011",
  bibsource =    "http://portal.acm.org/;
                 http://www.math.utah.edu/pub/tex/bib/trets.bib",
  abstract =     "In modern Web search engines, Neural Network
                 (NN)-based learning to rank algorithms is intensively
                 used to increase the quality of search results.
                 LambdaRank is one such algorithm. However, it is hard
                 to be efficiently accelerated by computer clusters or
                 GPUs, because: (i) the cost function for the ranking
                 problem is much more complex than that of traditional
                 Back-Propagation (BP) NNs, and (ii) no coarse-grained
                 parallelism exists in the algorithm. This article
                 presents an FPGA-based accelerator solution to provide
                 high computing performance with low power consumption.
                 A compact deep pipeline is proposed to handle the
                 complex computing in the batch updating. The area
                 scales linearly with the number of hidden nodes in the
                 algorithm. We also carefully design a data format to
                 enable streaming consumption of the training data from
                 the host computer. The accelerator shows up to 15.3X
                 (with PCIe x4) and 23.9X (with PCIe x8) speedup
                 compared with the pure software implementation on
                 datasets from a commercial search engine.",
  acknowledgement = ack-nhfb,
  articleno =    "25",
  fjournal =     "ACM Transactions on Reconfigurable Technology and
                 Systems (TRETS)",
  journal-URL =  "http://portal.acm.org/toc.cfm?id=J1151",
}

@Article{Aggarwal:2011:SMP,
  author =       "Vikas Aggarwal and Alan D. George and Changil Yoon and
                 Kishore Yalamanchili and Herman Lam",
  title =        "{SHMEM+}: a Multilevel-{PGAS} Programming Model for
                 Reconfigurable Supercomputing",
  journal =      j-TRETS,
  volume =       "4",
  number =       "3",
  pages =        "26:1--26:??",
  month =        aug,
  year =         "2011",
  CODEN =        "????",
  DOI =          "http://dx.doi.org/10.1145/2000832.2000838",
  ISSN =         "1936-7406 (print), 1936-7414 (electronic)",
  ISSN-L =       "1936-7406",
  bibdate =      "Tue Aug 30 08:13:57 MDT 2011",
  bibsource =    "http://portal.acm.org/;
                 http://www.math.utah.edu/pub/tex/bib/trets.bib",
  acknowledgement = ack-nhfb,
  articleno =    "26",
  fjournal =     "ACM Transactions on Reconfigurable Technology and
                 Systems (TRETS)",
  journal-URL =  "http://portal.acm.org/toc.cfm?id=J1151",
}

@Article{Holland:2011:AMM,
  author =       "Brian Holland and Alan D. George and Herman Lam and
                 Melissa C. Smith",
  title =        "An Analytical Model for Multilevel Performance
                 Prediction of Multi-{FPGA} Systems",
  journal =      j-TRETS,
  volume =       "4",
  number =       "3",
  pages =        "27:1--27:??",
  month =        aug,
  year =         "2011",
  CODEN =        "????",
  DOI =          "http://dx.doi.org/10.1145/2000832.2000839",
  ISSN =         "1936-7406 (print), 1936-7414 (electronic)",
  ISSN-L =       "1936-7406",
  bibdate =      "Tue Aug 30 08:13:57 MDT 2011",
  bibsource =    "http://portal.acm.org/;
                 http://www.math.utah.edu/pub/tex/bib/trets.bib",
  acknowledgement = ack-nhfb,
  articleno =    "27",
  fjournal =     "ACM Transactions on Reconfigurable Technology and
                 Systems (TRETS)",
  journal-URL =  "http://portal.acm.org/toc.cfm?id=J1151",
}

@Article{Shannon:2011:LRH,
  author =       "Lesley Shannon and Paul Chow",
  title =        "Leveraging Reconfigurability in the Hardware\slash
                 Software Codesign Process",
  journal =      j-TRETS,
  volume =       "4",
  number =       "3",
  pages =        "28:1--28:??",
  month =        aug,
  year =         "2011",
  CODEN =        "????",
  DOI =          "http://dx.doi.org/10.1145/2000832.2000840",
  ISSN =         "1936-7406 (print), 1936-7414 (electronic)",
  ISSN-L =       "1936-7406",
  bibdate =      "Tue Aug 30 08:13:57 MDT 2011",
  bibsource =    "http://portal.acm.org/;
                 http://www.math.utah.edu/pub/tex/bib/trets.bib",
  acknowledgement = ack-nhfb,
  articleno =    "28",
  fjournal =     "ACM Transactions on Reconfigurable Technology and
                 Systems (TRETS)",
  journal-URL =  "http://portal.acm.org/toc.cfm?id=J1151",
}

@Article{Nava:2011:ADR,
  author =       "Federico Nava and Donatella Sciuto and Marco Domenico
                 Santambrogio and Stefan Herbrechtsmeier and Mario
                 Porrmann and Ulf Witkowski and Ulrich Rueckert",
  title =        "Applying Dynamic Reconfiguration in the Mobile
                 Robotics Domain: a Case Study on Computer Vision
                 Algorithms",
  journal =      j-TRETS,
  volume =       "4",
  number =       "3",
  pages =        "29:1--29:??",
  month =        aug,
  year =         "2011",
  CODEN =        "????",
  DOI =          "http://dx.doi.org/10.1145/2000832.2000841",
  ISSN =         "1936-7406 (print), 1936-7414 (electronic)",
  ISSN-L =       "1936-7406",
  bibdate =      "Tue Aug 30 08:13:57 MDT 2011",
  bibsource =    "http://portal.acm.org/;
                 http://www.math.utah.edu/pub/tex/bib/trets.bib",
  acknowledgement = ack-nhfb,
  articleno =    "29",
  fjournal =     "ACM Transactions on Reconfigurable Technology and
                 Systems (TRETS)",
  journal-URL =  "http://portal.acm.org/toc.cfm?id=J1151",
}

@Article{Koehler:2011:PAB,
  author =       "Seth Koehler and Greg Stitt and Alan D. George",
  title =        "Platform-Aware Bottleneck Detection for Reconfigurable
                 Computing Applications",
  journal =      j-TRETS,
  volume =       "4",
  number =       "3",
  pages =        "30:1--30:??",
  month =        aug,
  year =         "2011",
  CODEN =        "????",
  DOI =          "http://dx.doi.org/10.1145/2000832.2000842",
  ISSN =         "1936-7406 (print), 1936-7414 (electronic)",
  ISSN-L =       "1936-7406",
  bibdate =      "Tue Aug 30 08:13:57 MDT 2011",
  bibsource =    "http://portal.acm.org/;
                 http://www.math.utah.edu/pub/tex/bib/trets.bib",
  acknowledgement = ack-nhfb,
  articleno =    "30",
  fjournal =     "ACM Transactions on Reconfigurable Technology and
                 Systems (TRETS)",
  journal-URL =  "http://portal.acm.org/toc.cfm?id=J1151",
}

@Article{Cheung:2011:ISS,
  author =       "Peter Y. K. Cheung",
  title =        "Introduction to Special Section {FPGA 2009}",
  journal =      j-TRETS,
  volume =       "4",
  number =       "4",
  pages =        "31:1--31:??",
  month =        dec,
  year =         "2011",
  CODEN =        "????",
  DOI =          "http://dx.doi.org/10.1145/2068716.2068717",
  ISSN =         "1936-7406 (print), 1936-7414 (electronic)",
  ISSN-L =       "1936-7406",
  bibdate =      "Fri Mar 16 16:20:35 MDT 2012",
  bibsource =    "http://portal.acm.org/;
                 http://www.math.utah.edu/pub/tex/bib/trets.bib",
  acknowledgement = ack-nhfb,
  articleno =    "31",
  fjournal =     "ACM Transactions on Reconfigurable Technology and
                 Systems (TRETS)",
  journal-URL =  "http://portal.acm.org/toc.cfm?id=J1151",
}

@Article{Luu:2011:VFC,
  author =       "Jason Luu and Ian Kuon and Peter Jamieson and Ted
                 Campbell and Andy Ye and Wei Mark Fang and Kenneth Kent
                 and Jonathan Rose",
  title =        "{VPR 5.0}: {FPGA CAD} and Architecture Exploration
                 Tools with Single-Driver Routing, Heterogeneity and
                 Process Scaling",
  journal =      j-TRETS,
  volume =       "4",
  number =       "4",
  pages =        "32:1--32:??",
  month =        dec,
  year =         "2011",
  CODEN =        "????",
  DOI =          "http://dx.doi.org/10.1145/2068716.2068718",
  ISSN =         "1936-7406 (print), 1936-7414 (electronic)",
  ISSN-L =       "1936-7406",
  bibdate =      "Fri Mar 16 16:20:35 MDT 2012",
  bibsource =    "http://portal.acm.org/;
                 http://www.math.utah.edu/pub/tex/bib/trets.bib",
  abstract =     "The VPR toolset has been widely used in FPGA
                 architecture and CAD research, but has not evolved over
                 the past decade. This article describes and illustrates
                 the use of a new version of the toolset that includes
                 four new features: first, it supports a broad range of
                 single-driver routing architectures, which have
                 superior architectural and electrical properties over
                 the prior multidriver approach (and which is now
                 employed in the majority of FPGAs sold). Second, it can
                 now model, for placement and routing a heterogeneous
                 selection of hard logic blocks. This is a key (but not
                 final) step toward the inclusion of blocks such as
                 memory and multipliers. Third, we provide optimized
                 electrical models for a wide range of architectures in
                 different process technologies, including a range of
                 area-delay trade-offs for each single architecture.
                 Finally, to maintain robustness and support future
                 development the release includes a set of regression
                 tests for the software. To illustrate the use of the
                 new features, we explore several architectural issues:
                 the FPGA area efficiency versus logic block
                 granularity, the effect of single-driver routing, and a
                 simple use of the heterogeneity to explore the impact
                 of hard multipliers on wiring track count.",
  acknowledgement = ack-nhfb,
  articleno =    "32",
  fjournal =     "ACM Transactions on Reconfigurable Technology and
                 Systems (TRETS)",
  journal-URL =  "http://portal.acm.org/toc.cfm?id=J1151",
}

@Article{Rubin:2011:CYO,
  author =       "Raphael Rubin and Andr{\'e} DeHon",
  title =        "Choose-Your-Own-Adventure Routing: Lightweight
                 Load-Time Defect Avoidance",
  journal =      j-TRETS,
  volume =       "4",
  number =       "4",
  pages =        "33:1--33:??",
  month =        dec,
  year =         "2011",
  CODEN =        "????",
  DOI =          "http://dx.doi.org/10.1145/2068716.2068719",
  ISSN =         "1936-7406 (print), 1936-7414 (electronic)",
  ISSN-L =       "1936-7406",
  bibdate =      "Fri Mar 16 16:20:35 MDT 2012",
  bibsource =    "http://portal.acm.org/;
                 http://www.math.utah.edu/pub/tex/bib/trets.bib",
  abstract =     "Aggressive scaling increases the number of devices we
                 can integrate per square millimeter but makes it
                 increasingly difficult to guarantee that each device
                 fabricated has the intended operational
                 characteristics. Without careful mitigation, component
                 yield rates will fall, potentially negating the
                 economic benefits of scaling. The fine-grained
                 reconfigurability inherent in FPGAs is a powerful tool
                 that can allow us to drop the stringent requirement
                 that every device be fabricated perfectly in order for
                 a component to be useful. To exploit inherent FPGA
                 reconfigurability while avoiding full CAD mapping, we
                 propose lightweight techniques compatible with the
                 current single bitstream model that can avoid defective
                 devices, reducing yield loss at high defect rates. In
                 particular, by embedding testing operations and
                 alternative path configurations into the bitstream,
                 each FPGA can avoid defects by making only simple,
                 greedy decisions at bitstream load time. With 20\%
                 additional tracks above the minimum routable channel
                 width, routes can tolerate 0.01\% switch and wire
                 defect rates, raising yield from essentially 0\% to
                 near 100\%.",
  acknowledgement = ack-nhfb,
  articleno =    "33",
  fjournal =     "ACM Transactions on Reconfigurable Technology and
                 Systems (TRETS)",
  journal-URL =  "http://portal.acm.org/toc.cfm?id=J1151",
}

@Article{Mishchenko:2011:SDC,
  author =       "Alan Mishchenko and Robert Brayton and Jie-Hong R.
                 Jiang and Stephen Jang",
  title =        "Scalable Don't-Care-Based Logic Optimization and
                 Resynthesis",
  journal =      j-TRETS,
  volume =       "4",
  number =       "4",
  pages =        "34:1--34:??",
  month =        dec,
  year =         "2011",
  CODEN =        "????",
  DOI =          "http://dx.doi.org/10.1145/2068716.2068720",
  ISSN =         "1936-7406 (print), 1936-7414 (electronic)",
  ISSN-L =       "1936-7406",
  bibdate =      "Fri Mar 16 16:20:35 MDT 2012",
  bibsource =    "http://portal.acm.org/;
                 http://www.math.utah.edu/pub/tex/bib/trets.bib",
  abstract =     "We describe an optimization method for combinational
                 and sequential logic networks, with emphasis on
                 scalability. The proposed resynthesis (a) is capable of
                 substantial logic restructuring, (b) is customizable to
                 solve a variety of optimization tasks, and (c) has
                 reasonable runtime on industrial designs. The approach
                 uses don't-cares computed for a window surrounding a
                 node and can take into account external don't-cares
                 (e.g., unreachable states). It uses a SAT solver for
                 all aspects of Boolean manipulation: computing
                 don't-cares for a node in the window, and deriving a
                 new Boolean function of the node after resubstitution.
                 Experimental results on 6-input LUT networks after a
                 high effort synthesis show substantial reductions in
                 area and delay. When applied to 20 large academic
                 benchmarks, the LUT counts and logic levels are reduced
                 by 45.0\% and 12.2\%, respectively. The longest runtime
                 for synthesis and mapping is about two minutes. When
                 applied to a set of 14 industrial benchmarks ranging up
                 to 83K 6-LUTs, the LUT counts and logic levels are
                 reduced by 11.8\% and 16.5\%, respectively. The longest
                 runtime is about 30 minutes.",
  acknowledgement = ack-nhfb,
  articleno =    "34",
  fjournal =     "ACM Transactions on Reconfigurable Technology and
                 Systems (TRETS)",
  journal-URL =  "http://portal.acm.org/toc.cfm?id=J1151",
}

@article{Kennings:2011:FTM,
  author          = {Andrew Kennings and Kristofer Vorwerk and Arun Kundu
                     and Val Pevzner and Andy Fox},
  title           = {{FPGA} technology mapping with encoded libraries and
                     staged priority cuts},
  journal         = j-TRETS,
  volume          = {4},
  number          = {4},
  pages           = {35:1--35:??},
  month           = dec,
  year            = {2011},
  coden           = {????},
  doi             = {http://dx.doi.org/10.1145/2068716.2068721},
  issn            = {1936-7406 (print), 1936-7414 (electronic)},
  issn-l          = {1936-7406},
  bibdate         = {Fri Mar 16 16:20:35 MDT 2012},
  bibsource       = {http://portal.acm.org/;
                     http://www.math.utah.edu/pub/tex/bib/trets.bib},
  abstract        = {Technology mapping is an important step in the FPGA
                     CAD flow in which a network of simple gates is
                     converted into a network of logic blocks. This article
                     considers enhancements to a traditional LUT-based
                     mapping algorithm for an FPGA comprised of logic blocks
                     which implement only a subset of functions of up to k
                     variables; specifically, the logic block is a partial
                     LUT, but it possesses more inputs than a typical LUT.
                     An analysis of the logic block is presented, and
                     techniques for postmapping area recovery and
                     timing-driven buffer insertion are also described.
                     Numerical results are put forth which substantiate the
                     efficacy of the proposed methods using real circuits
                     mapped to a commercial FPGA architecture.},
  acknowledgement = ack-nhfb,
  articleno       = {35},
  fjournal        = {ACM Transactions on Reconfigurable Technology and
                     Systems (TRETS)},
  journal-url     = {http://portal.acm.org/toc.cfm?id=J1151},
}

@article{Papadimitriou:2011:PPR,
  author          = {Kyprianos Papadimitriou and Apostolos Dollas and Scott
                     Hauck},
  title           = {Performance of partial reconfiguration in {FPGA}
                     systems: a survey and a cost model},
  journal         = j-TRETS,
  volume          = {4},
  number          = {4},
  pages           = {36:1--36:??},
  month           = dec,
  year            = {2011},
  coden           = {????},
  doi             = {http://dx.doi.org/10.1145/2068716.2068722},
  issn            = {1936-7406 (print), 1936-7414 (electronic)},
  issn-l          = {1936-7406},
  bibdate         = {Fri Mar 16 16:20:35 MDT 2012},
  bibsource       = {http://portal.acm.org/;
                     http://www.math.utah.edu/pub/tex/bib/trets.bib},
  abstract        = {Fine-grain reconfigurable devices suffer from the time
                     needed to load the configuration bitstream. Even for
                     small bitstreams in partially reconfigurable FPGAs this
                     time cannot be neglected. In this article we survey the
                     performance of the factors that contribute to the
                     reconfiguration speed. Then, we study an FPGA-based
                     system architecture and with real experiments we
                     produce a cost model of Partial Reconfiguration (PR).
                     This model is introduced to calculate the expected
                     reconfiguration time and throughput. In order to
                     develop a realistic model we take into account all the
                     physical components that participate in the
                     reconfiguration process. We analyze the parameters that
                     affect the generality of the model and the adjustments
                     needed per system for error-free evaluation. We verify
                     it with real measurements, and then we employ it to
                     evaluate existing systems presented in previous
                     publications. The percentage error of the cost model
                     when comparing its results with the actual values of
                     those publications varies from 36\% to 63\%, whereas
                     existing works report differences up to two orders of
                     magnitude. Present work enables a user to evaluate PR
                     and decide whether it is suitable for a certain
                     application prior entering the complex PR design
                     flow.},
  acknowledgement = ack-nhfb,
  articleno       = {36},
  fjournal        = {ACM Transactions on Reconfigurable Technology and
                     Systems (TRETS)},
  journal-url     = {http://portal.acm.org/toc.cfm?id=J1151},
}

@article{Chen:2011:EDL,
  author          = {Xiaoheng Chen and Venkatesh Akella},
  title           = {Exploiting data-level parallelism for energy-efficient
                     implementation of {LDPC} decoders and {DCT} on an
                     {FPGA}},
  journal         = j-TRETS,
  volume          = {4},
  number          = {4},
  pages           = {37:1--37:??},
  month           = dec,
  year            = {2011},
  coden           = {????},
  doi             = {http://dx.doi.org/10.1145/2068716.2068723},
  issn            = {1936-7406 (print), 1936-7414 (electronic)},
  issn-l          = {1936-7406},
  bibdate         = {Fri Mar 16 16:20:35 MDT 2012},
  bibsource       = {http://portal.acm.org/;
                     http://www.math.utah.edu/pub/tex/bib/trets.bib},
  abstract        = {We explore the use of Data-Level Parallelism (DLP) as
                     a way of improving the energy efficiency and power
                     consumption involved in running applications on an
                     FPGA. We show that static power consumption is a
                     significant fraction of the overall power consumption
                     in an FPGA and that it does not change significantly
                     even as the area required by an architecture increases,
                     because of the dominance of interconnect in an FPGA. We
                     show that the degree of DLP can be used in conjunction
                     with frequency scaling to reduce the overall power
                     consumption.},
  acknowledgement = ack-nhfb,
  articleno       = {37},
  fjournal        = {ACM Transactions on Reconfigurable Technology and
                     Systems (TRETS)},
  journal-url     = {http://portal.acm.org/toc.cfm?id=J1151},
}

@article{Easwaran:2011:NLB,
  author          = {Lakshmi Easwaran and Ali Akoglu},
  title           = {Net-length-based routability-driven power-aware
                     clustering},
  journal         = j-TRETS,
  volume          = {4},
  number          = {4},
  pages           = {38:1--38:??},
  month           = dec,
  year            = {2011},
  coden           = {????},
  doi             = {http://dx.doi.org/10.1145/2068716.2068724},
  issn            = {1936-7406 (print), 1936-7414 (electronic)},
  issn-l          = {1936-7406},
  bibdate         = {Fri Mar 16 16:20:35 MDT 2012},
  bibsource       = {http://portal.acm.org/;
                     http://www.math.utah.edu/pub/tex/bib/trets.bib},
  abstract        = {The state-of-the-art power-aware clustering tool,
                     P-T-VPack, achieves energy reduction by localizing nets
                     with high switching activity at the expense of channel
                     width and area. In this study, we employ predicted
                     individual postplacement net length information during
                     clustering and prioritize longer nets. This approach
                     targets the capacitance factor for energy reduction,
                     and prioritizes longer nets for channel width and area
                     reduction. We first introduce a new clustering
                     strategy, W-T-VPack, which replaces the switching
                     activity in P-T-VPack with a net length factor. We
                     obtain a 9.87\% energy reduction over T-VPack (3.78\%
                     increase over P-T-VPack), while at the same time
                     completely eliminating P-T-VPack's channel width and
                     area overhead. We then introduce W-P-T-VPack, which
                     combines switching activity and net length factors.
                     W-P-T-VPack achieves 14.26\% energy reduction (0.31\%
                     increase over P-T-VPack), while further improving
                     channel width by up to 12.87\% for different cluster
                     sizes. We investigate the energy performance of
                     routability (channel width)-driven clustering
                     algorithms, and show that W-T-VPack consistently
                     outperforms T-RPack and iRAC by at least 11.23\% and
                     9.07\%, respectively. We conclude that net-length-based
                     clustering is an effective method to concurrently
                     target energy and channel width.},
  acknowledgement = ack-nhfb,
  articleno       = {38},
  fjournal        = {ACM Transactions on Reconfigurable Technology and
                     Systems (TRETS)},
  journal-url     = {http://portal.acm.org/toc.cfm?id=J1151},
}

@article{Parandeh-Afshar:2011:CTS,
  author          = {Hadi Parandeh-Afshar and Arkosnato Neogy and Philip
                     Brisk and Paolo Ienne},
  title           = {Compressor tree synthesis on commercial
                     high-performance {FPGAs}},
  journal         = j-TRETS,
  volume          = {4},
  number          = {4},
  pages           = {39:1--39:??},
  month           = dec,
  year            = {2011},
  coden           = {????},
  doi             = {http://dx.doi.org/10.1145/2068716.2068725},
  issn            = {1936-7406 (print), 1936-7414 (electronic)},
  issn-l          = {1936-7406},
  bibdate         = {Fri Mar 16 16:20:35 MDT 2012},
  bibsource       = {http://portal.acm.org/;
                     http://www.math.utah.edu/pub/tex/bib/trets.bib},
  abstract        = {Compressor trees are a class of circuits that
                     generalizes multioperand addition and the partial
                     product reduction trees of parallel multipliers using
                     carry-save arithmetic. Compressor trees naturally occur
                     in many DSP applications, such as FIR filters, and, in
                     the more general case, their use can be maximized
                     through the application of high-level transformations
                     to arithmetically intensive data flow graphs. Due to
                     the presence of carry-chains, it has long been thought
                     that trees of 2- or 3-input carry-propagate adders are
                     more efficient than compressor trees for FPGA
                     synthesis; however, this is not the case. This article
                     presents a heuristic for FPGA synthesis of compressor
                     trees that outperforms adder trees and exploits
                     carry-chains when possible. The experimental results
                     show that, on average, the use of compressor trees can
                     reduce critical path delay by 33\% and 45\%
                     respectively, compared to adder trees synthesized on
                     the Xilinx Virtex-5 and Altera Stratix III FPGAs.},
  acknowledgement = ack-nhfb,
  articleno       = {39},
  fjournal        = {ACM Transactions on Reconfigurable Technology and
                     Systems (TRETS)},
  journal-url     = {http://portal.acm.org/toc.cfm?id=J1151},
}

@article{Inoue:2011:TCD,
  author          = {Hiroaki Inoue and Junya Yamada and Hideyuki Yoneda and
                     Katsumi Togawa and Masato Motomura and Koichiro
                     Furuta},
  title           = {Test compression for dynamically reconfigurable
                     processors},
  journal         = j-TRETS,
  volume          = {4},
  number          = {4},
  pages           = {40:1--40:??},
  month           = dec,
  year            = {2011},
  coden           = {????},
  doi             = {http://dx.doi.org/10.1145/2068716.2068726},
  issn            = {1936-7406 (print), 1936-7414 (electronic)},
  issn-l          = {1936-7406},
  bibdate         = {Fri Mar 16 16:20:35 MDT 2012},
  bibsource       = {http://portal.acm.org/;
                     http://www.math.utah.edu/pub/tex/bib/trets.bib},
  abstract        = {We present the world's first test compression
                     technique that features automation of compression rules
                     for test time reduction on dynamically reconfigurable
                     processors. Evaluations on an actual 40-nm product show
                     that our technique achieves a 2.7 times compression
                     ratio for original configuration information (better
                     than does GZIP), the peak decompression bandwidth of
                     1.6 GB/s, and 2.7 times shorter test times.},
  acknowledgement = ack-nhfb,
  articleno       = {40},
  fjournal        = {ACM Transactions on Reconfigurable Technology and
                     Systems (TRETS)},
  journal-url     = {http://portal.acm.org/toc.cfm?id=J1151},
}

@article{Zick:2012:LCS,
  author          = {Kenneth M. Zick and John P. Hayes},
  title           = {Low-cost sensing with ring oscillator arrays for
                     healthier reconfigurable systems},
  journal         = j-TRETS,
  volume          = {5},
  number          = {1},
  pages           = {1:1--1:??},
  month           = mar,
  year            = {2012},
  coden           = {????},
  doi             = {http://dx.doi.org/10.1145/2133352.2133353},
  issn            = {1936-7406 (print), 1936-7414 (electronic)},
  issn-l          = {1936-7406},
  bibdate         = {Tue Mar 20 12:12:48 MDT 2012},
  bibsource       = {http://portal.acm.org/;
                     http://www.math.utah.edu/pub/tex/bib/trets.bib},
  abstract        = {Electronic systems on a chip increasingly suffer from
                     component variation, voltage noise, thermal hotspots,
                     and other subtle physical phenomena. Systems with
                     reconfigurability have unique opportunities for
                     adapting to such effects. Required, however, are
                     low-cost, fine-grained methods for sensing physical
                     parameters. This article presents powerful, novel
                     approaches to online sensing, including methods for
                     designing compact reconfigurable sensors, low-cost
                     threshold detection, and several enhanced measurement
                     procedures. Together, the approaches help enable
                     systems to autonomously uncover a wealth of physical
                     information. A highly efficient counter and improved
                     ring oscillator are introduced, enabling an entire
                     sensor node in just 8 Virtex-5 LUTs. We describe how
                     variations can be measured in delay, temperature,
                     switching-induced IR drop, and leakage-induced IR drop.
                     We demonstrate the proposed approach with an
                     experimental system based on a Virtex-5, instrumented
                     with over 100 sensors at an overhead of only 1.3\%.
                     Results from thermally controlled experiments provide
                     some surprising insights and illustrate the utility of
                     the approach. Online sensing can help open the door to
                     physically adaptive computing, including fine-grained
                     power, reliability, and health management schemes for
                     systems on a chip.},
  acknowledgement = ack-nhfb,
  articleno       = {1},
  fjournal        = {ACM Transactions on Reconfigurable Technology and
                     Systems (TRETS)},
  journal-url     = {http://portal.acm.org/toc.cfm?id=J1151},
}

@article{Michail:2012:EHT,
  author          = {Harris E. Michail and George S. Athanasiou and Vasilis
                     Kelefouras and George Theodoridis and Costas E.
                     Goutis},
  title           = {On the exploitation of a high-throughput {SHA-256
                     FPGA} design for {HMAC}},
  journal         = j-TRETS,
  volume          = {5},
  number          = {1},
  pages           = {2:1--2:??},
  month           = mar,
  year            = {2012},
  coden           = {????},
  doi             = {http://dx.doi.org/10.1145/2133352.2133354},
  issn            = {1936-7406 (print), 1936-7414 (electronic)},
  issn-l          = {1936-7406},
  bibdate         = {Tue Mar 20 12:12:48 MDT 2012},
  bibsource       = {http://portal.acm.org/;
                     http://www.math.utah.edu/pub/tex/bib/trets.bib},
  abstract        = {High-throughput and area-efficient designs of hash
                     functions and corresponding mechanisms for Message
                     Authentication Codes (MACs) are in high demand due to
                     new security protocols that have arisen and call for
                     security services in every transmitted data packet. For
                     instance, IPv6 incorporates the IPSec protocol for
                     secure data transmission. However, the IPSec's
                     performance bottleneck is the HMAC mechanism which is
                     responsible for authenticating the transmitted data.
                     HMAC's performance bottleneck in its turn is the
                     underlying hash function. In this article a
                     high-throughput and small-size SHA-256 hash function
                     FPGA design and the corresponding HMAC FPGA design is
                     presented. Advanced optimization techniques have been
                     deployed leading to a SHA-256 hashing core which
                     performs more than 30\% better, compared to the next
                     better design. This improvement is achieved both in
                     terms of throughput as well as in terms of
                     throughput/area cost factor. It is the first reported
                     SHA-256 hashing core that exceeds 11Gbps (after place
                     and route in Xilinx Virtex 6 board).},
  acknowledgement = ack-nhfb,
  articleno       = {2},
  fjournal        = {ACM Transactions on Reconfigurable Technology and
                     Systems (TRETS)},
  journal-url     = {http://portal.acm.org/toc.cfm?id=J1151},
}

@Article{Olivares:2012:RAV,
  author =       "Joaqu{\'\i}n Olivares",
  title =        "Reconfigurable architecture for {VBSME} with variable
                 pixel precision",
  journal =      j-TRETS,
  volume =       "5",
  number =       "1",
  pages =        "3:1--3:??",
  month =        mar,
  year =         "2012",
  CODEN =        "????",
  DOI =          "http://dx.doi.org/10.1145/2133352.2133355",
  ISSN =         "1936-7406 (print), 1936-7414 (electronic)",
  ISSN-L =       "1936-7406",
  bibdate =      "Tue Mar 20 12:12:48 MDT 2012",
  bibsource =    "http://portal.acm.org/;
                 http://www.math.utah.edu/pub/tex/bib/trets.bib",
  abstract =     "Current video coding standards, e.g.\ MPEG-4 H.264/AVC,
                 include Variable Block Size Motion Estimation, in this
                 paper, this process is implemented by a reconfigurable
                 architecture based on Signed Digit arithmetic. Bit
                 serial computation is applied to reconfigure pixel
                 precision. The reconfigurable architectural model is
                 extremely simple to reconfigure. Pixel truncation is
                 used to speed up computation saving up 23.5\% of clock
                 cycles for 4-bit precision. This design allows to
                 process all motion vectors of a block in just one
                 iteration. This system has been implemented in FPGA,
                 and HDTVp results are presented. Main characteristics,
                 of this architecture are: very reduced cost, high
                 performance, and reconfigurable pixel precision, these
                 features could be useful in mobile devices.",
  acknowledgement = ack-nhfb,
  articleno =    "3",
  fjournal =     "ACM Transactions on Reconfigurable Technology and
                 Systems (TRETS)",
  journal-URL =  "http://portal.acm.org/toc.cfm?id=J1151",
}

@article{Siozios:2012:NFE,
  author          = {Kostas Siozios and Vasilis F. Pavlidis and Dimitrios
                     Soudris},
  title           = {A novel framework for exploring {$3$-D} {FPGAs} with
                     heterogeneous interconnect fabric},
  journal         = j-TRETS,
  volume          = {5},
  number          = {1},
  pages           = {4:1--4:??},
  month           = mar,
  year            = {2012},
  coden           = {????},
  doi             = {http://dx.doi.org/10.1145/2133352.2133356},
  issn            = {1936-7406 (print), 1936-7414 (electronic)},
  issn-l          = {1936-7406},
  bibdate         = {Tue Mar 20 12:12:48 MDT 2012},
  bibsource       = {http://portal.acm.org/;
                     http://www.math.utah.edu/pub/tex/bib/trets.bib},
  abstract        = {A heterogeneous interconnect architecture can be a
                     useful approach for the design of 3-D FPGAs. A
                     methodology to investigate heterogeneous
                     interconnection schemes for 3-D FPGAs under different
                     3-D fabrication technologies is proposed. Application
                     of the proposed methodology on benchmark circuits
                     demonstrates an improvement in delay, power
                     consumption, and total wire-length of approximately
                     41\%, 32\%, and 36\%, respectively, as compared to 2-D
                     FPGAs. These improvements are additional to reducing
                     the number of interlayer connections. The fewer
                     interlayer connections are traded off for a higher
                     yield. An area model to evaluate this trade-off is
                     presented. Results indicate that a heterogeneous 3-D
                     FPGA requires 37\% less area as compared to a
                     homogeneous 3-D FPGA. Consequently, the heterogeneous
                     FPGAs can exhibit a higher manufacturing yield. A
                     design toolset is also developed to support the design
                     and exploration of various performance metrics for the
                     proposed 3-D FPGAs.},
  acknowledgement = ack-nhfb,
  articleno       = {4},
  fjournal        = {ACM Transactions on Reconfigurable Technology and
                     Systems (TRETS)},
  journal-url     = {http://portal.acm.org/toc.cfm?id=J1151},
}

@article{Takano:2012:DAA,
  author          = {Shigeyuki Takano},
  title           = {Design and analysis of adaptive processor},
  journal         = j-TRETS,
  volume          = {5},
  number          = {1},
  pages           = {5:1--5:??},
  month           = mar,
  year            = {2012},
  coden           = {????},
  doi             = {http://dx.doi.org/10.1145/2133352.2133357},
  issn            = {1936-7406 (print), 1936-7414 (electronic)},
  issn-l          = {1936-7406},
  bibdate         = {Tue Mar 20 12:12:48 MDT 2012},
  bibsource       = {http://portal.acm.org/;
                     http://www.math.utah.edu/pub/tex/bib/trets.bib},
  abstract        = {A new computation model called CACHE (Cache
                     Architecture for Configurable Hardware Engine) is
                     proposed in this paper. This model does not require a
                     dedicated host processor and its software to harness
                     the reconfiguration. Autonomous reconfiguration is
                     performed within a working-set of application
                     datapaths. The CACHE model has lots of side effects;
                     caching, resource allocation and assignment, placement
                     and routing, and defragmentation, with a processing
                     array itself and a special register called a
                     working-set register file. The model aims to reduce
                     three major workloads: (1) the processor and
                     application design workload, (2) runtime resource
                     management and scheduling workload, and (3)
                     reconfiguration workload. In order to reduce these
                     workloads, processor architecture is definitely
                     different from traditional computing model and its
                     microprocessor architecture. There are three major
                     ideas to construct the computing system: (1) an on-chip
                     working-set model mainly in order to control load and
                     store of streams, namely to control traffics
                     introducing overheads, (2) an on-chip deadlock
                     properties model mainly in order to manage resources
                     and to continuously configure datapaths corresponding
                     to a working-set window, (3) a cache memory technique
                     to work for these models, the mechanism is equivalent
                     to the working-set window, and the cache memory's
                     procedure is equivalent to resource request,
                     acquirement, and release of deadlock properties. The
                     first model focuses onto streaming applications, for
                     example vector and matrix operations, filters, and so
                     on, which takes coarser grained operations such as
                     integer operations of C-language. Regarding performance
                     compared with DSPs, that comes from constant throughput
                     across different scale of the applications. In
                     addition, extended model, we call Instant model that
                     automatically generates instance of a datapath,
                     outperforms the DSPs. This paper shows its computation
                     model, architecture, low-level design, and analyses
                     about basic characteristics of the execution.},
  acknowledgement = ack-nhfb,
  articleno       = {5},
  fjournal        = {ACM Transactions on Reconfigurable Technology and
                     Systems (TRETS)},
  journal-url     = {http://portal.acm.org/toc.cfm?id=J1151},
}

@Article{Zhang:2012:PSF,
  author =       "Wei Zhang and Vaughn Betz and Jonathan Rose",
  title =        "Portable and scalable {FPGA}-based acceleration of a
                 direct linear system solver",
  journal =      j-TRETS,
  volume =       "5",
  number =       "1",
  pages =        "6:1--6:??",
  month =        mar,
  year =         "2012",
  CODEN =        "????",
  DOI =          "http://dx.doi.org/10.1145/2133352.2133358",
  ISSN =         "1936-7406 (print), 1936-7414 (electronic)",
  ISSN-L =       "1936-7406",
  bibdate =      "Tue Mar 20 12:12:48 MDT 2012",
  bibsource =    "http://portal.acm.org/;
                 http://www.math.utah.edu/pub/tex/bib/trets.bib",
  abstract =     "FPGAs have the potential to serve as a platform for
                 accelerating many computations including scientific
                 applications. However, the large development cost and
                 short life span for FPGA designs have limited their
                 adoption by the scientific computing community.
                 FPGA-based scientific computing and many kinds of
                 embedded computing could become more practical if there
                 were hardware libraries that were portable to any
                 FPGA-based system with performance that scaled with the
                 size of the FPGA. To illustrate this idea we have
                 implemented one common super-computing library
                 function: the LU factorization method for solving
                 systems of linear equations. This paper describes a
                 method for making the design both portable and scalable
                 that should be illustrative if such libraries are to be
                 built in the future. The design is a software-based
                 generator that leverages both the flexibility of a
                 software programming language and the parameters
                 inherent in an hardware description language. The
                 generator accepts parameters that describe the FPGA
                 capacity and external memory capabilities. We compare
                 the performance of our engine executing on the largest
                 FPGA available at the time of this work (an Altera
                 Stratix III 3S340) to a single processor core
                 fabricated in the same 65nm IC process running a highly
                 optimized software implementation from the processor
                 vendor. For single precision matrices on the order of $
                 10{,}000 \times 10{,}000 $ elements, the FPGA
                 implementation is 2.2 times faster and the energy
                 dissipated per useful GFLOP operation is a factor of 5
                 times less. For double precision, the FPGA
                 implementation is 1.7 times faster and 3.5 times more
                 energy efficient.",
  acknowledgement = ack-nhfb,
  articleno =    "6",
  fjournal =     "ACM Transactions on Reconfigurable Technology and
                 Systems (TRETS)",
  journal-URL =  "http://portal.acm.org/toc.cfm?id=J1151",
}

@article{Aggarwal:2012:SFT,
  author          = {Vikas Aggarwal and Greg Stitt and Alan George and
                     Changil Yoon},
  title           = {{SCF}: a Framework for Task-Level Coordination in
                     Reconfigurable, Heterogeneous Systems},
  journal         = j-TRETS,
  volume          = {5},
  number          = {2},
  pages           = {7:1--7:??},
  month           = jun,
  year            = {2012},
  coden           = {????},
  doi             = {http://dx.doi.org/10.1145/2209285.2209286},
  issn            = {1936-7406 (print), 1936-7414 (electronic)},
  issn-l          = {1936-7406},
  bibdate         = {Tue Nov 6 18:07:43 MST 2012},
  bibsource       = {http://portal.acm.org/;
                     http://www.math.utah.edu/pub/tex/bib/trets.bib},
  abstract        = {Heterogeneous computing systems comprised of
                     accelerators such as FPGAs, GPUs, and manycore
                     processors coupled with standard microprocessors are
                     becoming an increasingly popular solution for future
                     computing systems due to their higher performance and
                     energy efficiency. Although programming languages and
                     tools are evolving to simplify device-level design,
                     programming such systems is still difficult and
                     time-consuming largely due to system-wide challenges
                     involving communication between heterogeneous devices,
                     which currently require ad hoc solutions. Most
                     communication frameworks and APIs which have dominated
                     parallel application development for decades were
                     developed for homogeneous systems, and hence cannot be
                     directly employed for hybrid systems. To solve this
                     problem, this article presents the System Coordination
                     Framework (SCF), which employs message passing to
                     transparently enable communication between tasks
                     described using different programming tools (and
                     languages), and running on heterogeneous processing
                     devices of systems from domains ranging from embedded
                     systems to High-Performance Computing (HPC) systems. By
                     hiding low-level architectural details of the
                     underlying communication from an application designer,
                     SCF can improve application development productivity,
                     provide higher levels of application portability, and
                     offer rapid design-space exploration of different
                     task/device mappings. In addition, SCF enables custom
                     communication synthesis that exploits mechanisms
                     specific to different devices and platforms, which can
                     provide performance improvements over generic solutions
                     employed previously. Our results indicate a performance
                     improvement of 28$ \times $ and 682$ \times $ by
                     employing FPGA devices for two applications presented
                     in this article, while simultaneously improving the
                     developer productivity by approximately 2.5 to 5 times
                     by using SCF.},
  acknowledgement = ack-nhfb,
  articleno       = {7},
  fjournal        = {ACM Transactions on Reconfigurable Technology and
                     Systems (TRETS)},
  journal-url     = {http://portal.acm.org/toc.cfm?id=J1151},
}

%%% TRETS 5(2), June 2012, article 8: runtime defragmentation of the
%%% module layout of reconfigurable devices.  In the author list the
%%% lowercase words `van der' are read by BibTeX as the von part, so
%%% that name is handled under the surname `Veen'.  The `??' in the
%%% pages value stands in for an end page that is not recorded here.
@Article{Fekete:2012:DDR,
  author =       "S{\'a}ndor P. Fekete and Tom Kamphans and Nils Schweer
                 and Christopher Tessars and Jan C. van der Veen and
                 Josef Angermeier and Dirk Koch and J{\"u}rgen Teich",
  title =        "Dynamic Defragmentation of Reconfigurable Devices",
  journal =      j-TRETS,
  volume =       "5",
  number =       "2",
  pages =        "8:1--8:??",
  month =        jun,
  year =         "2012",
  CODEN =        "????",
  DOI =          "http://dx.doi.org/10.1145/2209285.2209287",
  ISSN =         "1936-7406 (print), 1936-7414 (electronic)",
  ISSN-L =       "1936-7406",
  bibdate =      "Tue Nov 6 18:07:43 MST 2012",
  bibsource =    "http://portal.acm.org/;
                 http://www.math.utah.edu/pub/tex/bib/trets.bib",
  abstract =     "We propose a new method for defragmenting the module
                 layout of a reconfigurable device, enabled by a novel
                 approach for dealing with communication needs between
                 relocated modules and with inhomogeneities found in
                 commonly used FPGAs. Our method is based on dynamic
                 relocation of module positions during runtime, with
                 only very little reconfiguration overhead; the
                 objective is to maximize the length of contiguous free
                 space that is available for new modules. We describe a
                 number of algorithmic aspects of good defragmentation,
                 and present an optimization method based on tabu
                 search. Experimental results indicate that we can
                 improve the quality of module layout by roughly 50\%
                 over the static layout. Among other benefits, this
                 improvement avoids unnecessary rejections of modules.",
  acknowledgement = ack-nhfb,
  articleno =    "8",
  fjournal =     "ACM Transactions on Reconfigurable Technology and
                 Systems (TRETS)",
  journal-URL =  "http://portal.acm.org/toc.cfm?id=J1151",
}

%%% TRETS 5(2), June 2012, article 9: statistical leakage/timing yield
%%% models used to tune FPGA architecture and device parameters.  The
%%% braces around `FPGAs' in the title keep the acronym's case when a
%%% style downcases titles.  The `??' in the pages value stands in for
%%% an end page that is not recorded here.
@Article{Cheng:2012:STP,
  author =       "Lerong Cheng and Wenyao Xu and Fang Gong and Yan Lin
                 and Ho-Yan Wong and Lei He",
  title =        "Statistical Timing and Power Optimization of
                 Architecture and Device for {FPGAs}",
  journal =      j-TRETS,
  volume =       "5",
  number =       "2",
  pages =        "9:1--9:??",
  month =        jun,
  year =         "2012",
  CODEN =        "????",
  DOI =          "http://dx.doi.org/10.1145/2209285.2209288",
  ISSN =         "1936-7406 (print), 1936-7414 (electronic)",
  ISSN-L =       "1936-7406",
  bibdate =      "Tue Nov 6 18:07:43 MST 2012",
  bibsource =    "http://portal.acm.org/;
                 http://www.math.utah.edu/pub/tex/bib/trets.bib",
  abstract =     "Process variation in nanometer technology is becoming
                 an important issue for cutting-edge FPGAs with a
                 multimillion gate capacity. Considering both die-to-die
                 and within-die variations in effective channel length,
                 threshold voltage, and gate oxide thickness, we first
                 develop closed-form models of chip-level FPGA leakage
                 and timing variations. Experiments show that the mean
                 and standard deviation computed by our models are
                 within 3\% from those computed by Monte Carlo
                 simulation. We also observe that the leakage and timing
                 variations can be up to 3X and 1.9X, respectively. We
                 then derive analytical yield models considering both
                 leakage and timing variations, and use such models to
                 evaluate the performance of FPGA device and
                 architecture considering process variations. Compared
                 to the baseline, which uses the VPR architecture and
                 device setup based on the ITRS roadmap, device and
                 architecture tuning improves leakage yield by 10.4\%,
                 timing yield by 5.7\%, and leakage and timing combined
                 yield by 9.4\%. We also observe that LUT size of 4
                 gives the highest leakage yield, LUT size of 7 gives
                 the highest timing yield, but LUT size of 5 achieves
                 the maximum leakage and timing combined yield. To the
                 best of our knowledge, this is the first in-depth study
                 on FPGA architecture and device coevaluation
                 considering process variation.",
  acknowledgement = ack-nhfb,
  articleno =    "9",
  fjournal =     "ACM Transactions on Reconfigurable Technology and
                 Systems (TRETS)",
  journal-URL =  "http://portal.acm.org/toc.cfm?id=J1151",
}

%%% TRETS 5(2), June 2012, article 10: constraint-programming flow for
%%% generating reconfigurable processor extensions and compiling
%%% applications onto them.  The cedilla in the last author's given
%%% name is written as a braced accent command so that BibTeX treats
%%% it as a single letter.  The `??' in the pages value stands in for
%%% an end page that is not recorded here.
@Article{Martin:2012:CPA,
  author =       "Kevin Martin and Christophe Wolinski and Krzysztof
                 Kuchcinski and Antoine Floch and Fran{\c{c}}ois
                 Charot",
  title =        "Constraint Programming Approach to Reconfigurable
                 Processor Extension Generation and Application
                 Compilation",
  journal =      j-TRETS,
  volume =       "5",
  number =       "2",
  pages =        "10:1--10:??",
  month =        jun,
  year =         "2012",
  CODEN =        "????",
  DOI =          "http://dx.doi.org/10.1145/2209285.2209289",
  ISSN =         "1936-7406 (print), 1936-7414 (electronic)",
  ISSN-L =       "1936-7406",
  bibdate =      "Tue Nov 6 18:07:43 MST 2012",
  bibsource =    "http://portal.acm.org/;
                 http://www.math.utah.edu/pub/tex/bib/trets.bib",
  abstract =     "In this article, we present a constraint programming
                 approach for solving hard design problems present when
                 automatically designing specialized processor
                 extensions. Specifically, we discuss our approach for
                 automatic selection and synthesis of processor
                 extensions as well as efficient application compilation
                 for these newly generated extensions. The discussed
                 approach is implemented in our integrated design
                 framework, IFPEC, built using Constraint Programming
                 (CP). In our framework, custom instructions,
                 implemented as processor extensions, are defined as
                 computational patterns and represented as graphs. This,
                 along with the graph representation of an application,
                 provides a way to use our CP framework equipped with
                 subgraph isomorphism and connected component
                 constraints for identification of processor extensions
                 as well as their selection, application scheduling,
                 binding, and routing. All design steps assume
                 architectures composed of runtime reconfigurable cells,
                 implementing selected extensions, tightly connected to
                 a processor. An advantage of our approach is the
                 possibility of combining different heterogeneous
                 constraints to represent and solve all our design
                 problems. Moreover, the flexibility and expressiveness
                 of the CP framework makes it possible to solve
                 simultaneously extension selection, application
                 scheduling, and binding and improve the quality of the
                 generated results. The article is largely illustrated
                 with experimental results.",
  acknowledgement = ack-nhfb,
  articleno =    "10",
  fjournal =     "ACM Transactions on Reconfigurable Technology and
                 Systems (TRETS)",
  journal-URL =  "http://portal.acm.org/toc.cfm?id=J1151",
}

%%% TRETS 5(3), October 2012, article 11: introduction to the ReCoSoC
%%% 2011 special issue.  This entry carries no abstract field.  The
%%% citation key uses the unaccented spelling `Hubner', while the
%%% author field keeps the umlaut as a braced accent command.  The
%%% `??' in the pages value stands in for an end page that is not
%%% recorded here.
@Article{Hubner:2012:ISI,
  author =       "Michael H{\"u}bner",
  title =        "Introduction to the Special Issue on {ReCoSoC 2011}",
  journal =      j-TRETS,
  volume =       "5",
  number =       "3",
  pages =        "11:1--11:??",
  month =        oct,
  year =         "2012",
  CODEN =        "????",
  DOI =          "http://dx.doi.org/10.1145/2362374.2362375",
  ISSN =         "1936-7406 (print), 1936-7414 (electronic)",
  ISSN-L =       "1936-7406",
  bibdate =      "Tue Nov 6 18:07:44 MST 2012",
  bibsource =    "http://portal.acm.org/;
                 http://www.math.utah.edu/pub/tex/bib/trets.bib",
  acknowledgement = ack-nhfb,
  articleno =    "11",
  fjournal =     "ACM Transactions on Reconfigurable Technology and
                 Systems (TRETS)",
  journal-URL =  "http://portal.acm.org/toc.cfm?id=J1151",
}

%%% TRETS 5(3), October 2012, article 12: asymmetric cache-coherency
%%% policies (bus-based and directory-based) for nonuniform multicore
%%% workloads.  The `??' in the pages value stands in for an end page
%%% that is not recorded here.
@Article{Shield:2012:ACC,
  author =       "John Shield and Jean-Philippe Diguet and Guy Gogniat",
  title =        "Asymmetric Cache Coherency: Policy Modifications to
                 Improve Multicore Performance",
  journal =      j-TRETS,
  volume =       "5",
  number =       "3",
  pages =        "12:1--12:??",
  month =        oct,
  year =         "2012",
  CODEN =        "????",
  DOI =          "http://dx.doi.org/10.1145/2362374.2362376",
  ISSN =         "1936-7406 (print), 1936-7414 (electronic)",
  ISSN-L =       "1936-7406",
  bibdate =      "Tue Nov 6 18:07:44 MST 2012",
  bibsource =    "http://portal.acm.org/;
                 http://www.math.utah.edu/pub/tex/bib/trets.bib",
  abstract =     "Asymmetric coherency is a new optimization method for
                 coherency policies to support nonuniform workloads in
                 multicore processors. Asymmetric coherency assists in
                 load balancing a workload and this is applicable to SoC
                 multicores where the applications are not evenly spread
                 among the processors and customization of the coherency
                 is possible. Asymmetric coherency is a policy change,
                 and consequently our designs require little or no
                 additional hardware over an existing system. We explore
                 two different types of asymmetric coherency policies.
                 Our bus-based asymmetric coherency policy, generated a
                 60\% coherency cost reduction (reduction of latencies
                 due to coherency messages) for nonshared data. Our
                 directory-based asymmetric coherency policy, showed up
                 to a 5.8\% execution time improvement and up to a 22\%
                 improvement in average memory latency for the parallel
                 benchmarks Sha, using a statically allocated asymmetry.
                 Dynamically allocated asymmetry was found to generate
                 further improvements in access latency, increasing the
                 effectiveness of asymmetric coherency by up to 73.8\%
                 when compared to the static asymmetric solution.",
  acknowledgement = ack-nhfb,
  articleno =    "12",
  fjournal =     "ACM Transactions on Reconfigurable Technology and
                 Systems (TRETS)",
  journal-URL =  "http://portal.acm.org/toc.cfm?id=J1151",
}

%%% TRETS 5(3), October 2012, article 13: the PreCoRe framework for
%%% load value speculation on reconfigurable computers.  The `??' in
%%% the pages value stands in for an end page that is not recorded
%%% here.
@Article{Thielmann:2012:MLH,
  author =       "Benjamin Thielmann and Jens Huthmann and Andreas
                 Koch",
  title =        "Memory Latency Hiding by Load Value Speculation for
                 Reconfigurable Computers",
  journal =      j-TRETS,
  volume =       "5",
  number =       "3",
  pages =        "13:1--13:??",
  month =        oct,
  year =         "2012",
  CODEN =        "????",
  DOI =          "http://dx.doi.org/10.1145/2362374.2362377",
  ISSN =         "1936-7406 (print), 1936-7414 (electronic)",
  ISSN-L =       "1936-7406",
  bibdate =      "Tue Nov 6 18:07:44 MST 2012",
  bibsource =    "http://portal.acm.org/;
                 http://www.math.utah.edu/pub/tex/bib/trets.bib",
  abstract =     "Load value speculation has long been proposed as a
                 method to hide the latency of memory accesses. It has
                 seen very limited use in actual processors, often due
                 to the high overhead of reexecuting misspeculated
                 computations. We present PreCoRe, a framework capable
                 of generating application-specific microarchitectures
                 supporting load value speculation on reconfigurable
                 computers. The article examines the lightweight
                 speculation and replay mechanisms, the architecture of
                 the actual data value prediction units as well as the
                 impact on the nonspeculative parts of the memory
                 system. In experiments, using PreCoRe has achieved
                 speedups of up to 2.48 times over nonspeculative
                 implementations.",
  acknowledgement = ack-nhfb,
  articleno =    "13",
  fjournal =     "ACM Transactions on Reconfigurable Technology and
                 Systems (TRETS)",
  journal-URL =  "http://portal.acm.org/toc.cfm?id=J1151",
}

%%% TRETS 5(3), October 2012, article 14: a Synchronous Data-Flow
%%% programming model for heterogeneous systems-on-chip on FPGAs.  The
%%% two-word surname of the last author is braced so that BibTeX keeps
%%% both words together as the family name; unbraced, the capitalized
%%% `Le' would be parsed as part of the given names.  The `??' in the
%%% pages value stands in for an end page that is not recorded here.
@Article{Gantel:2012:ERP,
  author =       "Laurent Gantel and Amel Khiar and Benoit Miramond and
                 Mohamed El Amine Benkhelifa and Lounis Kessal and
                 Fabrice Lemonnier and Jimmy {Le Rhun}",
  title =        "Enhancing Reconfigurable Platforms Programmability for
                 Synchronous Data-Flow Applications",
  journal =      j-TRETS,
  volume =       "5",
  number =       "3",
  pages =        "14:1--14:??",
  month =        oct,
  year =         "2012",
  CODEN =        "????",
  DOI =          "http://dx.doi.org/10.1145/2362374.2362378",
  ISSN =         "1936-7406 (print), 1936-7414 (electronic)",
  ISSN-L =       "1936-7406",
  bibdate =      "Tue Nov 6 18:07:44 MST 2012",
  bibsource =    "http://portal.acm.org/;
                 http://www.math.utah.edu/pub/tex/bib/trets.bib",
  abstract =     "Recent FPGAs allow the design of efficient and complex
                 Heterogeneous Systems-on-Chip (HSoC). Namely, these
                 systems are composed of several processors, hardware
                 accelerators as well as communication media between all
                 these components. Performances provided by HSoCs make
                 them really interesting for data-flow applications,
                 especially image processing applications. The use of
                 this kind of architecture provides good performances
                 but the drawback is an increase of the programming
                 complexity. This complexity is due to the heterogeneous
                 deployment of the application on the platform. Some
                 functions are implemented in software to run on a
                 processor, whereas other functions are implemented in
                 hardware to run in a reconfigurable partition of the
                 FPGA. This article aims to define a programming model
                 based on the Synchronous Data-Flow model, in order to
                 abstract the heterogeneity of the implementation and to
                 leverage the communication issue between software and
                 hardware actors.",
  acknowledgement = ack-nhfb,
  articleno =    "14",
  fjournal =     "ACM Transactions on Reconfigurable Technology and
                 Systems (TRETS)",
  journal-URL =  "http://portal.acm.org/toc.cfm?id=J1151",
}

%%% TRETS 5(3), October 2012, article 15: a circuit-switched on-chip
%%% router combining spatial and time division multiplexing.  The
%%% braces around `SDM--TDM' in the title keep the acronyms' case when
%%% a style downcases titles.  The `??' in the pages value stands in
%%% for an end page that is not recorded here.
@Article{Lusala:2012:STB,
  author =       "Angelo Kuti Lusala and Jean-Didier Legat",
  title =        "A {SDM--TDM}-Based Circuit-Switched Router for On-Chip
                 Networks",
  journal =      j-TRETS,
  volume =       "5",
  number =       "3",
  pages =        "15:1--15:??",
  month =        oct,
  year =         "2012",
  CODEN =        "????",
  DOI =          "http://dx.doi.org/10.1145/2362374.2362379",
  ISSN =         "1936-7406 (print), 1936-7414 (electronic)",
  ISSN-L =       "1936-7406",
  bibdate =      "Tue Nov 6 18:07:44 MST 2012",
  bibsource =    "http://portal.acm.org/;
                 http://www.math.utah.edu/pub/tex/bib/trets.bib",
  abstract =     "This article proposes a circuit-switched router that
                 combines Spatial Division Multiplexing (SDM) and Time
                 Division Multiplexing (TDM) in order to increase path
                 diversity in the router while sharing channels among
                 multiple connections. In this way, the probability of
                 establishing paths through the network is increased,
                 thereby significantly reducing contention in the
                 network. Furthermore, Quality of Service (QoS) is
                 easily guaranteed. The proposed router was synthesized
                 on an Stratix III 3SL340F FPGA device. A 4 $ \times $ 4
                 2D Mesh SDM-TDM Network-on-Chip (NoC) was built with
                 the proposed router and synthesized on the 3SL340F FPGA
                 device. The 4 $ \times $ 4 2D Mesh SDM-TDM NoC was used
                 to build on an FPGA device, a Multiprocessor
                 System-on-Chip (MPSoC) platform consisted of 16 Nios
                 II/f processors, 16 20-KB On-chip Memories, and 16
                 Network Interfaces. Synthesis results of the MPSoC
                 platform show that the proposed router architecture can
                 be used to built large practicable MPSoC platforms with
                 the proposed NoC architecture with a reasonable
                 hardware overhead and appreciable clock frequency.
                 Simulation results show that combining SDM and TDM
                 techniques in a router allows the highest probability
                 of establishing paths through the network.",
  acknowledgement = ack-nhfb,
  articleno =    "15",
  fjournal =     "ACM Transactions on Reconfigurable Technology and
                 Systems (TRETS)",
  journal-URL =  "http://portal.acm.org/toc.cfm?id=J1151",
}

%%% TRETS 5(3), October 2012, article 16: secure extensions of FPGA
%%% general purpose processors for symmetric key cryptography, with
%%% partial reconfiguration of the cipher zone.  The braces around
%%% `FPGA' in the title keep the acronym's case when a style downcases
%%% titles.  The `??' in the pages value stands in for an end page
%%% that is not recorded here.
@Article{Gaspar:2012:SEF,
  author =       "Lubos Gaspar and Viktor Fischer and Lilian Bossuet and
                 Robert Fouquet",
  title =        "Secure Extension of {FPGA} General Purpose Processors
                 for Symmetric Key Cryptography with Partial
                 Reconfiguration Capabilities",
  journal =      j-TRETS,
  volume =       "5",
  number =       "3",
  pages =        "16:1--16:??",
  month =        oct,
  year =         "2012",
  CODEN =        "????",
  DOI =          "http://dx.doi.org/10.1145/2362374.2362380",
  ISSN =         "1936-7406 (print), 1936-7414 (electronic)",
  ISSN-L =       "1936-7406",
  bibdate =      "Tue Nov 6 18:07:44 MST 2012",
  bibsource =    "http://portal.acm.org/;
                 http://www.math.utah.edu/pub/tex/bib/trets.bib",
  abstract =     "In data security systems, general purpose processors
                 (GPPs) are often extended by a cryptographic
                 accelerator. The article presents three ways of
                 extending GPPs for symmetric key cryptography
                 applications. Proposed extensions guarantee secure key
                 storage and management even if the system is facing
                 protocol, software and cache memory attacks. The system
                 is partitioned into processor, cipher, and key memory
                 zones. The three security zones are separated at
                 protocol, system, architecture and physical levels. The
                 proposed principle was validated on Altera NIOS II,
                 Xilinx MicroBlaze and Microsemi Cortex M1 soft-core
                 processor extensions. We show that stringent separation
                 of the cipher zone is helpful for partial
                 reconfiguration of the security module, if the
                 enciphering algorithm needs to be dynamically changed.
                 However, the key zone including reconfiguration
                 controller must remain static in order to maintain the
                 high level of security required. We demonstrate that
                 the principle is feasible in partially reconfigurable
                 field programmable gate arrays (FPGAs) such as Altera
                 Stratix V or Xilinx Virtex 6 and also to some extent in
                 FPGAs featuring hardwired general purpose processors
                 such as Cortex M3 in Microsemi SmartFusion FPGA.
                 Although the three GPPs feature different data
                 interfaces, we show that the processors with their
                 extensions reach the required high security level while
                 maintaining partial reconfiguration capability.",
  acknowledgement = ack-nhfb,
  articleno =    "16",
  fjournal =     "ACM Transactions on Reconfigurable Technology and
                 Systems (TRETS)",
  journal-URL =  "http://portal.acm.org/toc.cfm?id=J1151",
}

%%% TRETS 5(3), October 2012, article 17: virtualization-based task
%%% migration and dynamic mapping in heterogeneous MPSoCs.  The braces
%%% around `MPSoCs' in the title keep its mixed case when a style
%%% downcases titles.  The `??' in the pages value stands in for an
%%% end page that is not recorded here.
@Article{Ost:2012:EAT,
  author =       "Luciano Ost and Sameer Varyani and Leandro Soares
                 Indrusiak and Marcelo Mandelli and Gabriel Marchesan
                 Almeida and Eduardo Wachter and Fernando Moraes and
                 Gilles Sassatelli",
  title =        "Enabling Adaptive Techniques in Heterogeneous {MPSoCs}
                 Based on Virtualization",
  journal =      j-TRETS,
  volume =       "5",
  number =       "3",
  pages =        "17:1--17:??",
  month =        oct,
  year =         "2012",
  CODEN =        "????",
  DOI =          "http://dx.doi.org/10.1145/2362374.2362381",
  ISSN =         "1936-7406 (print), 1936-7414 (electronic)",
  ISSN-L =       "1936-7406",
  bibdate =      "Tue Nov 6 18:07:44 MST 2012",
  bibsource =    "http://portal.acm.org/;
                 http://www.math.utah.edu/pub/tex/bib/trets.bib",
  abstract =     "This article explores the use of virtualization to
                 enable mechanisms like task migration and dynamic
                 mapping in heterogeneous MPSoCs, thereby targeting the
                 design of systems capable of adapt their behavior to
                 time-changing workloads. Because tasks may have to be
                 mapped to target processors with different instruction
                 set architectures, we propose the use of Low Level
                 Virtual Machine (LLVM) to postcompile the tasks at
                 runtime depending on their target processor. A novel
                 dynamic mapping heuristic is also proposed, aiming to
                 exploit the advantages of specialized processors while
                 taking into account the overheads imposed by
                 virtualization. Extensive experimental work at
                 different levels of abstraction---FPGA prototype, RTL
                 and system-level simulation---is presented to evaluate
                 the proposed techniques.",
  acknowledgement = ack-nhfb,
  articleno =    "17",
  fjournal =     "ACM Transactions on Reconfigurable Technology and
                 Systems (TRETS)",
  journal-URL =  "http://portal.acm.org/toc.cfm?id=J1151",
}

%%% TRETS 5(3), October 2012, article 18: the Web-based Remote FPGA
%%% Lab (RFL) for teaching digital systems.  The braces around `FPGA'
%%% in the title keep the acronym's case when a style downcases
%%% titles.  The `??' in the pages value stands in for an end page
%%% that is not recorded here.
@Article{Morgan:2012:RFL,
  author =       "Fearghal Morgan and Seamus Cawley and David Newell",
  title =        "Remote {FPGA} Lab for Enhancing Learning of Digital
                 Systems",
  journal =      j-TRETS,
  volume =       "5",
  number =       "3",
  pages =        "18:1--18:??",
  month =        oct,
  year =         "2012",
  CODEN =        "????",
  DOI =          "http://dx.doi.org/10.1145/2362374.2362382",
  ISSN =         "1936-7406 (print), 1936-7414 (electronic)",
  ISSN-L =       "1936-7406",
  bibdate =      "Tue Nov 6 18:07:44 MST 2012",
  bibsource =    "http://portal.acm.org/;
                 http://www.math.utah.edu/pub/tex/bib/trets.bib",
  abstract =     "Learning in digital systems can be enhanced through
                 applying a learn-by-doing approach on practical
                 hardware systems and by using Web-based technology to
                 visualize and animate hardware behavior. The authors
                 have reported the Web-based Remote FPGA Lab (RFL) which
                 provides a novel, real-time control and visualization
                 interface to a remote, always-on FPGA hardware
                 implementation. The RFL helps students to understand
                 and reason about digital systems operation, using
                 interactive animation of signal behavior in an
                 executing digital logic system, at any level of the
                 design hierarchy. The RFL supports the creation of
                 real-time interactive digital systems teaching demos.
                 The article presents student RFL usage data and survey
                 data which highlight improved student engagement,
                 learning and achievement. The article describes the RFL
                 architecture, communication interface, Web page
                 functionality, user access administration and database
                 management. The article also describes the RFLGen
                 program, developed to automate user design integration
                 into the Xilinx ISE VHDL-based RFL project wrapper for
                 creation of FPGA configuration bitstreams and RFL
                 animations.",
  acknowledgement = ack-nhfb,
  articleno =    "18",
  fjournal =     "ACM Transactions on Reconfigurable Technology and
                 Systems (TRETS)",
  journal-URL =  "http://portal.acm.org/toc.cfm?id=J1151",
}

%%% TRETS 5(4), December 2012, article 19: an evaluation platform
%%% combining power emulation and fault injection for embedded
%%% systems.  The whole title sits inside one brace group, which keeps
%%% its deliberately mixed capitalization (the letters that spell out
%%% the acronym) intact under styles that downcase titles.  In the
%%% abstract, `security- or' and `security- and' are suspended
%%% hyphens, so a space follows each hyphen.  The `??' in the pages
%%% value stands in for an end page that is not recorded here.
@Article{Krieg:2012:PMP,
  author =       "Armin Krieg and Johannes Grinschgl and Christian
                 Steger and Reinhold Weiss and Holger Bock and Josef
                 Haid",
  title =        "{POWER-MODES: POWer-EmulatoR- and MOdel-Based
                 DEpendability and Security Evaluations}",
  journal =      j-TRETS,
  volume =       "5",
  number =       "4",
  pages =        "19:1--19:??",
  month =        dec,
  year =         "2012",
  CODEN =        "????",
  DOI =          "http://dx.doi.org/10.1145/2392616.2392617",
  ISSN =         "1936-7406 (print), 1936-7414 (electronic)",
  ISSN-L =       "1936-7406",
  bibdate =      "Sun May 5 09:22:43 MDT 2013",
  bibsource =    "http://portal.acm.org/;
                 http://www.math.utah.edu/pub/tex/bib/trets.bib",
  abstract =     "Innovation cycles have been shortening significantly
                 during the last years. This process puts tremendous
                 pressure on designers of embedded systems for
                 security- or reliability-critical applications. Eventual
                 design problems not detected during design time can
                 lead to lost money, confidentiality, or even loss of
                 life in extreme cases. Therefore it is of vital
                 importance to evaluate a new system for its robustness
                 against intentionally and random induced operational
                 faults. Currently this is generally done using
                 extensive simulation runs using gate-level models or
                 direct measurements on the finished silicon product.
                 These approaches either need a significant amount of
                 time and computational power for these simulations or
                 rely on existing product samples. This article presents
                 a novel system evaluation platform using power
                 emulation and fault injection techniques to provide an
                 additional tool for developers of embedded systems in
                 security- and reliability-critical fields. Faults are
                 emulated using state-of-the-art fault injection methods
                 and a flexible pattern representation approach. The
                 resulting effects of these faults on the power
                 consumption profile are estimated using
                 state-of-the-art power emulation hardware. A modular
                 system augmentation approach provides emulation
                 flexibility similar to fault simulation
                 implementations. The platform enables the efficient
                 evaluation of new hardware or software implementations
                 of critical security or reliability solutions at an
                 early development phase.",
  acknowledgement = ack-nhfb,
  articleno =    "19",
  fjournal =     "ACM Transactions on Reconfigurable Technology and
                 Systems (TRETS)",
  journal-URL =  "http://portal.acm.org/toc.cfm?id=J1151",
}

%%% TRETS 5(4), December 2012, article 20: adaptive voltage scaling
%%% combined with dynamic reconfiguration on an FPGA-based platform.
%%% The braces around `FPGA' in the title keep the acronym's case when
%%% a style downcases titles.  NOTE(review): the author names are
%%% stored without diacritics; confirm against the publisher record
%%% whether accents are missing.  The `??' in the pages value stands
%%% in for an end page that is not recorded here.
@Article{Nabina:2012:AVS,
  author =       "Atukem Nabina and Jose Luis Nunez-Yanez",
  title =        "Adaptive Voltage Scaling in a Dynamically
                 Reconfigurable {FPGA}-Based Platform",
  journal =      j-TRETS,
  volume =       "5",
  number =       "4",
  pages =        "20:1--20:??",
  month =        dec,
  year =         "2012",
  CODEN =        "????",
  DOI =          "http://dx.doi.org/10.1145/2392616.2392618",
  ISSN =         "1936-7406 (print), 1936-7414 (electronic)",
  ISSN-L =       "1936-7406",
  bibdate =      "Sun May 5 09:22:43 MDT 2013",
  bibsource =    "http://portal.acm.org/;
                 http://www.math.utah.edu/pub/tex/bib/trets.bib",
  abstract =     "Power is an important issue limiting the applicability
                 of Field Programmable Gate Arrays (FPGAs) since it is
                 considered to be up to one order of magnitude higher
                 than in ASICs. Recently, dynamic reconfiguration in
                 FPGAs has emerged as a viable technique able to achieve
                 power and cost reductions by time-multiplexing the
                 required functionality at runtime. In this article, the
                 applicability of Adaptive Voltage Scaling (AVS) to
                 FPGAs is considered together with dynamic
                 reconfiguration of logic and clock management resources
                 to further improve the power profile of these devices.
                 AVS is a popular power-saving technique in ASICs that
                 enables a device to regulate its own voltage and
                 frequency based on workload, fabrication, and operating
                 conditions. The resulting processing platform exploits
                 the available application-dependent timing margins to
                 achieve a power reduction up to 85\% operating at 0.58
                 volts compared with operating at a nominal voltage of 1
                 volt. The results also show that the energy
                 requirements at 0.58 volts are approximately five times
                 lower compared with nominal voltage and this can be
                 explained by the approximate cubic relation of static
                 energy with voltage and the fact that the static
                 component dominates power consumption in the considered
                 FPGA devices.",
  acknowledgement = ack-nhfb,
  articleno =    "20",
  fjournal =     "ACM Transactions on Reconfigurable Technology and
                 Systems (TRETS)",
  journal-URL =  "http://portal.acm.org/toc.cfm?id=J1151",
}

%%% ACM TRETS volume 5, number 4 (December 2012), article 21.
@article{Jacobs:2012:RFT,
  author          = {Adam Jacobs and Grzegorz Cieslewski and Alan D. George
                     and Ann Gordon-Ross and Herman Lam},
  title           = {Reconfigurable Fault Tolerance: a Comprehensive
                     Framework for Reliable and Adaptive {FPGA}-Based Space
                     Computing},
  journal         = j-TRETS,
  fjournal        = {ACM Transactions on Reconfigurable Technology and
                     Systems (TRETS)},
  volume          = {5},
  number          = {4},
  articleno       = {21},
  pages           = {21:1--21:??},
  month           = dec,
  year            = {2012},
  DOI             = {http://dx.doi.org/10.1145/2392616.2392619},
  ISSN            = {1936-7406 (print), 1936-7414 (electronic)},
  ISSN-L          = {1936-7406},
  CODEN           = {????},
  journal-URL     = {http://portal.acm.org/toc.cfm?id=J1151},
  bibdate         = {Sun May 5 09:22:43 MDT 2013},
  bibsource       = {http://portal.acm.org/;
                     http://www.math.utah.edu/pub/tex/bib/trets.bib},
  acknowledgement = ack-nhfb,
  abstract        = {Commercial SRAM-based, field-programmable gate arrays
                     (FPGAs) have the potential to provide space
                     applications with the necessary performance to meet
                     next-generation mission requirements. However,
                     mitigating an FPGA's susceptibility to single-event
                     upset (SEU) radiation is challenging. Triple-modular
                     redundancy (TMR) techniques are traditionally used to
                     mitigate radiation effects, but TMR incurs substantial
                     overheads such as increased area and power
                     requirements. In order to reduce these overheads while
                     still providing sufficient radiation mitigation, we
                     propose a reconfigurable fault tolerance (RFT)
                     framework that enables system designers to dynamically
                     adjust a system's level of redundancy and fault
                     mitigation based on the varying radiation incurred at
                     different orbital positions. This framework includes an
                     adaptive hardware architecture that leverages FPGA
                     reconfigurable techniques to enable significant
                     processing to be performed efficiently and reliably
                     when environmental factors permit. To accurately
                     estimate upset rates, we propose an upset rate modeling
                     tool that captures time-varying radiation effects for
                     arbitrary satellite orbits using a collection of
                     existing, publicly available tools and models. We
                     perform fault-injection testing on a prototype RFT
                     platform to validate the RFT architecture and RFT
                     performability models. We combine our RFT hardware
                     architecture and the modeled upset rates using
                     phased-mission Markov modeling to estimate
                     performability gains achievable using our framework for
                     two case-study orbits.},
}

%%% ACM TRETS volume 5, number 4 (December 2012), article 22.
@article{Cancare:2012:EHC,
  author          = {Fabio Cancare and Davide B. Bartolini and Matteo
                     Carminati and Donatella Sciuto and Marco D.
                     Santambrogio},
  title           = {On the Evolution of Hardware Circuits via
                     Reconfigurable Architectures},
  journal         = j-TRETS,
  fjournal        = {ACM Transactions on Reconfigurable Technology and
                     Systems (TRETS)},
  volume          = {5},
  number          = {4},
  articleno       = {22},
  pages           = {22:1--22:??},
  month           = dec,
  year            = {2012},
  DOI             = {http://dx.doi.org/10.1145/2392616.2392620},
  ISSN            = {1936-7406 (print), 1936-7414 (electronic)},
  ISSN-L          = {1936-7406},
  CODEN           = {????},
  journal-URL     = {http://portal.acm.org/toc.cfm?id=J1151},
  bibdate         = {Sun May 5 09:22:43 MDT 2013},
  bibsource       = {http://portal.acm.org/;
                     http://www.math.utah.edu/pub/tex/bib/trets.bib},
  acknowledgement = ack-nhfb,
  abstract        = {Traditionally, hardware circuits are realized
                     according to techniques that follow the classical
                     phases of design and testing. A completely new approach
                     in the creation of hardware circuits has been
                     proposed---the Evolvable Hardware (EHW) paradigm, which
                     bases the circuit synthesis on a goal-oriented
                     evolutionary process inspired by biological evolution
                     in Nature. FPGA-based approaches have emerged as the
                     main architectural solution to implement EHW systems.
                     Various EHW systems have been proposed by researchers
                     but most of them, being based on outdated chips, do not
                     take advantage of the interesting features introduced
                     in newer FPGAs. This article describes a project named
                     Hardware Evolution over Reconfigurable Architectures
                     (HERA), which aims at creating a complete and
                     performance-oriented framework for the evolution of
                     digital circuits, leveraging the reconfiguration
                     technology available in FPGAs. The project is described
                     from its birth to its current state, presenting its
                     evolutionary technique tailored for FPGA-based circuits
                     and the most recent enhancements to improve the
                     scalability with respect to problem size. The developed
                     EHW system outperforms the state of the art, proving
                     its effectiveness in evolving both standard benchmarks
                     and more complex real-world applications.},
}

%%% ACM TRETS volume 6, number 1 (May 2013), article 1.
@article{Ould-Bachir:2013:SAS,
  author          = {Tarek Ould-Bachir and Jean Pierre David},
  title           = {Self-Alignment Schemes for the Implementation of
                     Addition-Related Floating-Point Operators},
  journal         = j-TRETS,
  fjournal        = {ACM Transactions on Reconfigurable Technology and
                     Systems (TRETS)},
  volume          = {6},
  number          = {1},
  articleno       = {1},
  pages           = {1:1--1:??},
  month           = may,
  year            = {2013},
  DOI             = {http://dx.doi.org/10.1145/2457443.2457444},
  ISSN            = {1936-7406 (print), 1936-7414 (electronic)},
  ISSN-L          = {1936-7406},
  CODEN           = {????},
  journal-URL     = {http://portal.acm.org/toc.cfm?id=J1151},
  bibdate         = {Thu Mar 13 08:09:42 MDT 2014},
  bibsource       = {http://portal.acm.org/;
                     http://www.math.utah.edu/pub/tex/bib/fparith.bib;
                     http://www.math.utah.edu/pub/tex/bib/trets.bib},
  acknowledgement = ack-nhfb,
  abstract        = {Advances in semiconductor technology brings to the
                     market incredibly dense devices, capable of handling
                     tens to hundreds floating-point operators on a single
                     chip; so do the latest field programmable gate arrays
                     (FPGAs). In order to alleviate the complexity of
                     resorting to these devices in computationally intensive
                     applications, this article proposes hardware schemes
                     for the realization of addition-related floating-point
                     operators based on the self-alignment technique (SAT).
                     The article demonstrates that the schemes guarantee an
                     accuracy as if summation was computed accurately in the
                     precision of operator's internal mantissa, then
                     faithfully rounded to working precision. To achieve
                     such performance, the article adopts the redundant high
                     radix carry-save (HRCS) format for the rapid addition
                     of wide mantissas. Implementation results show that
                     combining the SAT and the HRCS format allows the
                     implementation of complex operators with reduced area
                     and latency, more so when a fused-path approach is
                     adopted. The article also proposes a new hardware
                     operator for performing endomorphic HRCS additions and
                     presents a new technique for speeding up the conversion
                     from the redundant HRCS to a conventional binary
                     format.},
}

%%% Only the acronym FPGA is brace-protected in the title, so a style that
%%% downcases titles can still lowercase the word Based.
@Article{Zhang:2013:FBA,
  author =       "Yan Zhang and Fan Zhang and Zheming Jin and Jason D.
                 Bakos",
  title =        "An {FPGA}-Based Accelerator for Frequent Itemset
                 Mining",
  journal =      j-TRETS,
  volume =       "6",
  number =       "1",
  pages =        "2:1--2:??",
  month =        may,
  year =         "2013",
  CODEN =        "????",
  DOI =          "http://dx.doi.org/10.1145/2457443.2457445",
  ISSN =         "1936-7406 (print), 1936-7414 (electronic)",
  ISSN-L =       "1936-7406",
  bibdate =      "Thu Mar 13 08:09:42 MDT 2014",
  bibsource =    "http://portal.acm.org/;
                 http://www.math.utah.edu/pub/tex/bib/datacompression.bib;
                 http://www.math.utah.edu/pub/tex/bib/trets.bib",
  abstract =     "In this article we describe a Field Programmable Gate
                 Array (FPGA)-based coprocessor architecture for
                 Frequent Itemset Mining (FIM). FIM is a common data
                 mining task used to find frequently occurring subsets
                 amongst a database of sets. FIM is a nonnumerical, data
                 intensive computation and is used in machine learning
                 and computational biology. FIM is particularly
                 expensive---in terms of execution time and
                 memory---when performed on large and/or sparse
                 databases or when applied using a low appearance
                 frequency threshold. Because of this, the development
                 of increasingly efficient FIM algorithms and their
                 mapping to parallel architectures is an active field.
                 Previous attempts to accelerate FIM using FPGAs have
                 relied on performance-limiting strategies such as
                 iterative database loading and runtime logic unit
                 reconfiguration. In this article, we present a novel
                 architecture to implement Eclat, a well-known FIM
                 algorithm. Unlike previous efforts, our technique does
                 not impose limits on the maximum set size as a function
                 of available FPGA logic resources and our design scales
                 well to multiple FPGAs. In addition to a novel hardware
                 design, we also present a corresponding compression
                 scheme for intermediate results that are stored in
                 on-chip memory. On a four-FPGA board, experimental
                 results show up to 68X speedup compared to a highly
                 optimized software implementation.",
  acknowledgement = ack-nhfb,
  articleno =    "2",
  fjournal =     "ACM Transactions on Reconfigurable Technology and
                 Systems (TRETS)",
  journal-URL =  "http://portal.acm.org/toc.cfm?id=J1151",
}

%%% Quipu is brace-protected in the title so that it keeps its capital
%%% letter when a style downcases titles.
@Article{Meeuws:2013:QSM,
  author =       "Roel Meeuws and S. Arash Ostadzadeh and Carlo Galuzzi
                 and Vlad Mihai Sima and Razvan Nane and Koen Bertels",
  title =        "{Quipu}: a Statistical Model for Predicting Hardware
                 Resources",
  journal =      j-TRETS,
  volume =       "6",
  number =       "1",
  pages =        "3:1--3:??",
  month =        may,
  year =         "2013",
  CODEN =        "????",
  DOI =          "http://dx.doi.org/10.1145/2457443.2457446",
  ISSN =         "1936-7406 (print), 1936-7414 (electronic)",
  ISSN-L =       "1936-7406",
  bibdate =      "Thu Mar 13 08:09:42 MDT 2014",
  bibsource =    "http://portal.acm.org/;
                 http://www.math.utah.edu/pub/tex/bib/trets.bib",
  abstract =     "There has been a steady increase in the utilization of
                 heterogeneous architectures to tackle the growing need
                 for computing performance and low-power systems. The
                 execution of computation-intensive functions on
                 specialized hardware enables to achieve substantial
                 speedups and power savings. However, with a large
                 legacy code base and software engineering experts, it
                 is not at all obvious how to easily utilize these new
                 architectures. As a result, there is a need for
                 comprehensive tool support to bridge the knowledge gap
                 of many engineers as well as to retarget legacy code.
                 In this article, we present the Quipu modeling
                 approach, which consists of a set of tools and a
                 modeling methodology that can generate hardware
                 estimation models, which provide valuable information
                 for developers. This information helps to focus their
                 efforts, to partition their application, and to select
                 the right heterogeneous components. We present Quipu's
                 capability to generate domain-specific models, that are
                 up to several times more accurate within their
                 particular domain (error: 4.6\%) as compared to
                 domain-agnostic models (error: 23\%). Finally, we show
                 how Quipu can generate models for a new toolchain and
                 platform within a few days.",
  acknowledgement = ack-nhfb,
  articleno =    "3",
  fjournal =     "ACM Transactions on Reconfigurable Technology and
                 Systems (TRETS)",
  journal-URL =  "http://portal.acm.org/toc.cfm?id=J1151",
}

%%% ACM TRETS volume 6, number 1 (May 2013), article 4.
@article{deDinechin:2013:FPE,
  author          = {Florent de Dinechin and Pedro Echeverr{\'\i}a and
                     Marisa L{\'o}pez-Vallejo and Bogdan Pasca},
  title           = {Floating-Point Exponentiation Units for Reconfigurable
                     Computing},
  journal         = j-TRETS,
  fjournal        = {ACM Transactions on Reconfigurable Technology and
                     Systems (TRETS)},
  volume          = {6},
  number          = {1},
  articleno       = {4},
  pages           = {4:1--4:??},
  month           = may,
  year            = {2013},
  DOI             = {http://dx.doi.org/10.1145/2457443.2457447},
  ISSN            = {1936-7406 (print), 1936-7414 (electronic)},
  ISSN-L          = {1936-7406},
  CODEN           = {????},
  journal-URL     = {http://portal.acm.org/toc.cfm?id=J1151},
  bibdate         = {Thu Mar 13 08:09:42 MDT 2014},
  bibsource       = {http://portal.acm.org/;
                     http://www.math.utah.edu/pub/tex/bib/fparith.bib;
                     http://www.math.utah.edu/pub/tex/bib/trets.bib},
  acknowledgement = ack-nhfb,
  abstract        = {The high performance and capacity of current FPGAs
                     makes them suitable as acceleration co-processors. This
                     article studies the implementation, for such
                     accelerators, of the floating-point power function
                     $x^y$ as defined by the C99 and IEEE 754-2008
                     standards, generalized here to arbitrary exponent and
                     mantissa sizes. Last-bit accuracy at the smallest
                     possible cost is obtained thanks to a careful study of
                     the various subcomponents: a floating-point logarithm,
                     a modified floating-point exponential, and a truncated
                     floating-point multiplier. A parameterized architecture
                     generator in the open-source FloPoCo project is
                     presented in details and evaluated.},
}

%%% ACM TRETS volume 6, number 1 (May 2013), article 5.
@article{Neely:2013:RTH,
  author          = {Christopher E. Neely and Gordon Brebner and Weijia
                     Shang},
  title           = {{ReShape}: Towards a High-Level Approach to Design and
                     Operation of Modular Reconfigurable Systems},
  journal         = j-TRETS,
  fjournal        = {ACM Transactions on Reconfigurable Technology and
                     Systems (TRETS)},
  volume          = {6},
  number          = {1},
  articleno       = {5},
  pages           = {5:1--5:??},
  month           = may,
  year            = {2013},
  DOI             = {http://dx.doi.org/10.1145/2457443.2457448},
  ISSN            = {1936-7406 (print), 1936-7414 (electronic)},
  ISSN-L          = {1936-7406},
  CODEN           = {????},
  journal-URL     = {http://portal.acm.org/toc.cfm?id=J1151},
  bibdate         = {Thu Mar 13 08:09:42 MDT 2014},
  bibsource       = {http://portal.acm.org/;
                     http://www.math.utah.edu/pub/tex/bib/trets.bib},
  acknowledgement = ack-nhfb,
  abstract        = {The latest FPGA devices provide the headroom to
                     implement large-scale and complex systems. A key
                     requirement is the integration of modules from diverse
                     sources to promote modular design and reuse. A contrary
                     factor is that using dynamic partial reconfiguration
                     typically requires low-level planning of the system
                     implementation. In this article, we introduce ReShape:
                     a high-level approach for designing reconfigurable
                     systems by interconnecting modules, which gives a
                     ``plug and play'' look and feel, is supported by tools
                     that carry out implementation functions, and is carried
                     through to support system reconfiguration during
                     operation. The emphasis is on the inter-module
                     connections and abstracting the communication patterns
                     that are typical between modules: for example, the
                     streaming of data, or the reading and writing of data
                     to and from memory modules. The details of wiring and
                     signaling are hidden from view, via metadata associated
                     with individual modules. This setting allows system
                     reconfiguration at the module level, both by supporting
                     type checking of replacement modules and by managing
                     the overall system implementation, via metadata
                     associated with its FPGA floorplan. The methodology and
                     tools have been implemented in a prototype targeted to
                     a domain-specific setting---high-speed networking---and
                     have been validated on real telecommunications design
                     projects.},
}

%%% Special-section introduction; this entry records no abstract.  The title
%%% is stored in Title Case, and the braced workshop name keeps its capitals
%%% when a style downcases titles.
@Article{Goehringer:2013:ISS,
  author =       "Diana Goehringer and Ren{\'e} Cumplido",
  title =        "Introduction to the Special Section on {19th
                 Reconfigurable Architectures Workshop (RAW 2012)}",
  journal =      j-TRETS,
  volume =       "6",
  number =       "2",
  pages =        "6:1--6:??",
  month =        jul,
  year =         "2013",
  CODEN =        "????",
  DOI =          "http://dx.doi.org/10.1145/2499625.2499626",
  ISSN =         "1936-7406 (print), 1936-7414 (electronic)",
  ISSN-L =       "1936-7406",
  bibdate =      "Thu Mar 13 08:09:43 MDT 2014",
  bibsource =    "http://portal.acm.org/;
                 http://www.math.utah.edu/pub/tex/bib/trets.bib",
  acknowledgement = ack-nhfb,
  articleno =    "6",
  fjournal =     "ACM Transactions on Reconfigurable Technology and
                 Systems (TRETS)",
  journal-URL =  "http://portal.acm.org/toc.cfm?id=J1151",
}

%%% Title is stored in Title Case; the braced acronyms JITPR and FPGAs keep
%%% their capitals when a style downcases titles.
@Article{Sidiropoulos:2013:JFS,
  author =       "Harry Sidiropoulos and Kostas Siozios and Peter Figuli
                 and Dimitrios Soudris and Michael H{\"u}bner and
                 J{\"u}rgen Becker",
  title =        "{JITPR}: a Framework for Supporting Fast Application's
                 Implementation onto {FPGAs}",
  journal =      j-TRETS,
  volume =       "6",
  number =       "2",
  pages =        "7:1--7:??",
  month =        jul,
  year =         "2013",
  CODEN =        "????",
  DOI =          "http://dx.doi.org/10.1145/2492185",
  ISSN =         "1936-7406 (print), 1936-7414 (electronic)",
  ISSN-L =       "1936-7406",
  bibdate =      "Thu Mar 13 08:09:43 MDT 2014",
  bibsource =    "http://portal.acm.org/;
                 http://www.math.utah.edu/pub/tex/bib/trets.bib",
  abstract =     "The execution runtime usually is a headache for
                 designers performing application mapping onto
                 reconfigurable architectures. In this article we
                 propose a methodology, as well as the supporting
                 toolset, targeting to provide fast application
                 implementation onto reconfigurable architectures with
                 the usage of a Just-In-Time (JIT) compilation
                 framework. Experimental results prove the efficiency of
                 the introduced framework, as we reduce the execution
                 runtime compared to the state-of-the-art approach on
                 average by 53.5$ \times $. Additionally, the derived
                 solutions achieve higher operation frequencies by 1.17$
                 \times $, while they also exhibit significant lower
                 fragmentation ratios of hardware resources.",
  acknowledgement = ack-nhfb,
  articleno =    "7",
  fjournal =     "ACM Transactions on Reconfigurable Technology and
                 Systems (TRETS)",
  journal-URL =  "http://portal.acm.org/toc.cfm?id=J1151",
}

%%% Title is stored in Title Case; a bibliography style may still downcase
%%% it on output.
@Article{Heisswolf:2013:VND,
  author =       "Jan Heisswolf and Aurang Zaib and Andreas
                 Weichslgartner and Ralf K{\"o}nig and Thomas Wild and
                 J{\"u}rgen Teich and Andreas Herkersdorf and J{\"u}rgen
                 Becker",
  title =        "Virtual Networks --- Distributed Communication
                 Resource Management",
  journal =      j-TRETS,
  volume =       "6",
  number =       "2",
  pages =        "8:1--8:??",
  month =        jul,
  year =         "2013",
  CODEN =        "????",
  DOI =          "http://dx.doi.org/10.1145/2492186",
  ISSN =         "1936-7406 (print), 1936-7414 (electronic)",
  ISSN-L =       "1936-7406",
  bibdate =      "Thu Mar 13 08:09:43 MDT 2014",
  bibsource =    "http://portal.acm.org/;
                 http://www.math.utah.edu/pub/tex/bib/trets.bib",
  abstract =     "Networks-on-Chip (NoC) enable scalability for future
                 manycore architectures, facilitating parallel
                 communication between multiple cores. Applications
                 running in parallel on a NoC-based architecture can
                 affect each other due to overlapping communication.
                 Quality-of-Service (QoS) must be supported by the
                 communication infrastructure to execute communication-,
                 real-time- and safety-critical applications on such an
                 architecture. Different strategies have been proposed
                 to provide QoS for point-to-point connections. These
                 strategies allow each node to set up a limited number
                 of connections to other nodes. In this work Virtual
                 Networks (VN) are proposed to enable QoS for regions of
                 a NoC-based architecture. Virtual Networks overcome the
                 limitation of point-to-point connections. A VN behaves
                 like an exclusive physical network. Virtual Networks
                 can be defined and configured during runtime. The size
                 of the VN region and the assigned bandwidth can be
                 adjusted depending on the application requirements.
                 Virtual Networks enable the decoupling of local from
                 global communication. Therefore, the communication of
                 the application mapped into the region is assigned to a
                 Virtual Network established in that specific region.
                 This concept targets packet-switched networks with
                 virtual channels and is realized by an intelligent
                 hardware unit that manages the virtual channel
                 reservation process at system runtime. Virtual Networks
                 can be established and administrated independent of
                 each other, enabling distributed communication resource
                 management. The proposed concept is implemented as a
                 cycle-accurate SystemC simulation model. The simulation
                 results of executing communicating graphs obtained from
                 real application highlight the usefulness of Virtual
                 Networks by showing improved throughput and reduced
                 delay in the respective scenarios. A hardware
                 implementation demonstrates a low impact on area
                 utilization and power consumption.",
  acknowledgement = ack-nhfb,
  articleno =    "8",
  fjournal =     "ACM Transactions on Reconfigurable Technology and
                 Systems (TRETS)",
  journal-URL =  "http://portal.acm.org/toc.cfm?id=J1151",
}

%%% Title is stored in Title Case; the braced acronym FPGA keeps its
%%% capitals when a style downcases titles.
@Article{Ganegedara:2013:CPA,
  author =       "Thilan Ganegedara and Viktor Prasanna",
  title =        "A Comprehensive Performance Analysis of Virtual
                 Routers on {FPGA}",
  journal =      j-TRETS,
  volume =       "6",
  number =       "2",
  pages =        "9:1--9:??",
  month =        jul,
  year =         "2013",
  CODEN =        "????",
  DOI =          "http://dx.doi.org/10.1145/2492187",
  ISSN =         "1936-7406 (print), 1936-7414 (electronic)",
  ISSN-L =       "1936-7406",
  bibdate =      "Thu Mar 13 08:09:43 MDT 2014",
  bibsource =    "http://portal.acm.org/;
                 http://www.math.utah.edu/pub/tex/bib/trets.bib;
                 http://www.math.utah.edu/pub/tex/bib/virtual-machines.bib",
  abstract =     "Network virtualization has gained much popularity with
                 the advent of datacenter networking. The hardware
                 aspect of network virtualization, router
                 virtualization, allows network service providers to
                 consolidate network hardware, reducing equipment cost
                 and management overhead. Several approaches have been
                 proposed to achieve router virtualization to support
                 several virtual networks on a single hardware platform.
                 However, their performance has not been analyzed
                 quantitatively to understand the benefits of each
                 approach. In this work, we perform a comprehensive
                 analysis of performance of these approaches on Field
                 Programmable Gate Array (FPGA) with respect to memory
                 consumption, throughput, and power consumption.
                 Generalized versions of virtualization approaches are
                 evaluated based on post place-and-route results on a
                 state-of-the-art FPGA. Grouping of routing tables is
                 proposed as a novel approach to improve scalability
                 (i.e., the number of virtual networks hosted on a
                 single chip) of virtual routers on FPGA with respect to
                 memory requirement. Further, we employ floor-planning
                 techniques to efficiently utilize chip resources and
                 achieve high performance for virtualized, pipelined
                 architectures, resulting in 1.6$ \times $ speedup on
                 the average compared with the non-floor-planned
                 approach. The results indicate that the proposed
                 solution is able to support 100+ and 50 virtual routers
                 per chip in the near-best and near-worst case
                 scenarios, while operating at 20+ Gbps rates.",
  acknowledgement = ack-nhfb,
  articleno =    "9",
  fjournal =     "ACM Transactions on Reconfigurable Technology and
                 Systems (TRETS)",
  journal-URL =  "http://portal.acm.org/toc.cfm?id=J1151",
}

%%% Title is stored in Title Case; the braced acronym FPGA keeps its
%%% capitals when a style downcases titles.
@Article{Das:2013:TDA,
  author =       "Joydip Das and Steven J. E. Wilton",
  title =        "Towards Development of an Analytical Model Relating
                 {FPGA} Architecture Parameters to Routability",
  journal =      j-TRETS,
  volume =       "6",
  number =       "2",
  pages =        "10:1--10:??",
  month =        jul,
  year =         "2013",
  CODEN =        "????",
  DOI =          "http://dx.doi.org/10.1145/2499625.2499627",
  ISSN =         "1936-7406 (print), 1936-7414 (electronic)",
  ISSN-L =       "1936-7406",
  bibdate =      "Thu Mar 13 08:09:43 MDT 2014",
  bibsource =    "http://portal.acm.org/;
                 http://www.math.utah.edu/pub/tex/bib/trets.bib",
  abstract =     "We present an analytical model relating FPGA
                 architectural parameters to the routability of the
                 FPGA. The inputs to the model include the channel width
                 and the connection and the switch block flexibilities.
                 The output is an estimate of the proportion of nets in
                 a large circuit that can be expected to be successfully
                 routed on the FPGA. We assume that the circuit is
                 routed to the FPGA using a single-step combined
                 global/detailed router. We show that the model
                 correctly predicts routability trends. We also present
                 an example application to demonstrate that this model
                 may be a valuable tool for FPGA architects. When
                 combined with the earlier works on analytical modeling,
                 our model can be used to quickly predict the
                 routability without going through any stage of an
                 expensive CAD flow. We envisage that this model will
                 benefit FPGA architecture designers and vendors to
                 quickly evaluate FPGA routing fabrics.",
  acknowledgement = ack-nhfb,
  articleno =    "10",
  fjournal =     "ACM Transactions on Reconfigurable Technology and
                 Systems (TRETS)",
  journal-URL =  "http://portal.acm.org/toc.cfm?id=J1151",
}

%%% Title is stored in Title Case.  The space after the slash macro only
%%% terminates the control word, so no space is typeset around the slash.
@Article{Huang:2013:VHS,
  author =       "Chun-Hsian Huang and Pao-Ann Hsiung",
  title =        "Virtualizable Hardware\slash Software Design
                 Infrastructure for Dynamically Partially Reconfigurable
                 Systems",
  journal =      j-TRETS,
  volume =       "6",
  number =       "2",
  pages =        "11:1--11:??",
  month =        jul,
  year =         "2013",
  CODEN =        "????",
  DOI =          "http://dx.doi.org/10.1145/2499625.2499628",
  ISSN =         "1936-7406 (print), 1936-7414 (electronic)",
  ISSN-L =       "1936-7406",
  bibdate =      "Thu Mar 13 08:09:43 MDT 2014",
  bibsource =    "http://portal.acm.org/;
                 http://www.math.utah.edu/pub/tex/bib/trets.bib;
                 http://www.math.utah.edu/pub/tex/bib/virtual-machines.bib",
  abstract =     "In most existing works, reconfigurable hardware
                 modules are still managed as conventional hardware
                 devices. Further, the software reconfiguration overhead
                 incurred by loading corresponding device drivers into
                 the kernel of an operating system has been overlooked
                 until now. As a result, the enhancement of system
                 performance and the utilization of reconfigurable
                 hardware modules are still quite limited. This work
                 proposes a virtualizable hardware/software design
                 infrastructure (VDI) for dynamically partially
                 reconfigurable systems. Besides the gate-level hardware
                 virtualization provided by the partial reconfiguration
                 technology, VDI supports the device-level hardware
                 virtualization. In VDI, a reconfigurable hardware
                 module can be virtualized such that it can be accessed
                 efficiently by multiple applications in an interleaving
                 way. A Hot-Plugin Connector (HPC) replaces the
                 conventional device driver, such that it not only
                 assists the device-level hardware virtualization but
                 can also be reused across different hardware modules.
                 To facilitate hardware/software communication and to
                 enhance system scalability, the proposed VDI is
                 realized as a hierarchical design framework.
                 User-designed reconfigurable hardware modules can be
                 easily integrated into VDI, and are then executed as
                 hardware tasks in an operating system for
                 reconfigurable systems (OS4RS). A dynamically partially
                 reconfigurable network security system was designed
                 using VDI, which demonstrated a higher utilization of
                 reconfigurable hardware modules and a reduction by up
                 to 12.83\% of the processing time required by using the
                 conventional method in a dynamically partially
                 reconfigurable system.",
  acknowledgement = ack-nhfb,
  articleno =    "11",
  fjournal =     "ACM Transactions on Reconfigurable Technology and
                 Systems (TRETS)",
  journal-URL =  "http://portal.acm.org/toc.cfm?id=J1151",
}

@Article{Liu:2013:INL,
    author = {Hanyu Liu and Senthilkumar T. Rajavel and Ali Akoglu},
    title = {Integration of Net-Length Factor with Timing- and
        Routability-Driven Clustering Algorithms},
    journal = j-TRETS,
    volume = {6},
    number = {3},
    pages = {12:1--12:??},
    month = oct,
    year = {2013},
    CODEN = {????},
    DOI = {http://dx.doi.org/10.1145/2517324},
    ISSN = {1936-7406 (print), 1936-7414 (electronic)},
    ISSN-L = {1936-7406},
    bibdate = {Thu Mar 13 08:09:45 MDT 2014},
    bibsource = {http://portal.acm.org/;
        http://www.math.utah.edu/pub/tex/bib/trets.bib},
    abstract = {In FPGA CAD flow, the clustering stage builds the
        foundation for placement and routing stages and affects
        performance parameters, such as routability, delay, and
        channel width significantly. Net sharing and
        criticality are the two most commonly used factors in
        clustering cost functions. With this study, we first
        derive a third term, net-length factor, and then design
        a generic method for integrating net length into the
        clustering algorithms. Net-length factor enables
        characterizing the nets based on the routing stress
        they might cause during later stages of the CAD flow
        and is essential for enhancing the routability of the
        design. We evaluate the effectiveness of integrating
        net length as a factor into the well-known timing
        (T-VPack)-, depopulation (T-NDPack)-, and routability
        (iRAC and T-RPack)-driven clustering algorithms.
        Through exhaustive experimental studies, we show that
        net-length factor consistently helps improve the
        channel-width performance of routability-,
        depopulation-, and timing-driven clustering algorithms
        that do not explicitly target low fan-out nets in their
        cost functions. Particularly, net-length factor leads
        to average reduction in channel width for T-VPack,
        T-RPack, and T-NDPack by 11.6\%, 10.8\%, and 14.2\%,
        respectively, and in a majority of the cases, improves
        the critical-path delay without increasing the array
        size.},
    acknowledgement = ack-nhfb,
    articleno = {12},
    fjournal = {ACM Transactions on Reconfigurable Technology and
        Systems (TRETS)},
    journal-URL = {http://portal.acm.org/toc.cfm?id=J1151},
}

@Article{Mehta:2013:UGE,
    author = {Gayatri Mehta and Carson Crawford and Xiaozhong Luo
        and Natalie Parde and Krunalkumar Patel and Brandon
        Rodgers and Anil Kumar Sistla and Anil Yadav and Marc
        Reisner},
    title = {{UNTANGLED}: a Game Environment for Discovery of
        Creative Mapping Strategies},
    journal = j-TRETS,
    volume = {6},
    number = {3},
    pages = {13:1--13:??},
    month = oct,
    year = {2013},
    CODEN = {????},
    DOI = {http://dx.doi.org/10.1145/2517325},
    ISSN = {1936-7406 (print), 1936-7414 (electronic)},
    ISSN-L = {1936-7406},
    bibdate = {Thu Mar 13 08:09:45 MDT 2014},
    bibsource = {http://portal.acm.org/;
        http://www.math.utah.edu/pub/tex/bib/trets.bib},
    abstract = {The problem of creating efficient mappings of dataflow
        graphs onto specific architectures (i.e., solving the
        place and route problem) is incredibly challenging. The
        difficulty is especially acute in the area of
        Coarse-Grained Reconfigurable Architectures (CGRAs) to
        the extent that solving the mapping problem may remove
        a significant bottleneck to adoption. We believe that
        the next generation of mapping algorithms will exhibit
        pattern recognition, the ability to learn from
        experience, and identification of creative solutions,
        all of which are human characteristics. This manuscript
        describes our game UNTANGLED, developed and fine-tuned
        over the course of a year to allow us to capture and
        analyze human mapping strategies. It also describes our
        results to date. We find that the mapping problem can
        be crowdsourced very effectively, that players can
        outperform existing algorithms, and that successful
        player strategies share many elements in common. Based
        on our observations and analysis, we make concrete
        recommendations for future research directions for
        mapping onto CGRAs.},
    acknowledgement = ack-nhfb,
    articleno = {13},
    fjournal = {ACM Transactions on Reconfigurable Technology and
        Systems (TRETS)},
    journal-URL = {http://portal.acm.org/toc.cfm?id=J1151},
}

@Article{Hormigo:2013:SRC,
    author = {Javier Hormigo and Gabriel Caffarena and Juan P.
        Oliver and Eduardo Boemo},
    title = {Self-Reconfigurable Constant Multiplier for {FPGA}},
    journal = j-TRETS,
    volume = {6},
    number = {3},
    pages = {14:1--14:??},
    month = oct,
    year = {2013},
    CODEN = {????},
    DOI = {http://dx.doi.org/10.1145/2490830},
    ISSN = {1936-7406 (print), 1936-7414 (electronic)},
    ISSN-L = {1936-7406},
    bibdate = {Thu Mar 13 08:09:45 MDT 2014},
    bibsource = {http://portal.acm.org/;
        http://www.math.utah.edu/pub/tex/bib/trets.bib},
    abstract = {Constant multipliers are widely used in signal
        processing applications to implement the multiplication
        of signals by a constant coefficient. However, in some
        applications, this coefficient remains invariable only
        during an interval of time, and then, its value changes
        to adapt to new circumstances. In this article, we
        present a self-reconfigurable constant multiplier
        suitable for LUT-based FPGAs able to reload the
        constant in runtime. The pipelined architecture
        presented is easily scalable to any multiplicand and
        constant sizes, for unsigned and signed
        representations. It can be reprogrammed in 16 clock
        cycles, equivalent to less than 100 ns in current
        FPGAs. This value is significantly smaller than FPGA
        partial configuration times. The presented approach is
        more efficient in terms of area and speed when compared
        to generic multipliers, achieving up to 91\% area
        reduction and up to 102\% speed improvement for the
        case-study circuits tested. The power consumption of
        the proposed multipliers are in the range of those of
        slice-based multipliers provided by the vendor.},
    acknowledgement = ack-nhfb,
    articleno = {14},
    fjournal = {ACM Transactions on Reconfigurable Technology and
        Systems (TRETS)},
    journal-URL = {http://portal.acm.org/toc.cfm?id=J1151},
}

@Article{Gharibian:2013:ASL,
  author =       "Farnaz Gharibian and Lesley Shannon and Peter Jamieson
                 and Kevin Chung",
  title =        "Analyzing System-Level Information's Correlation to
                 {FPGA} Placement",
  journal =      j-TRETS,
  volume =       "6",
  number =       "3",
  pages =        "15:1--15:??",
  month =        oct,
  year =         "2013",
  CODEN =        "????",
  DOI =          "http://dx.doi.org/10.1145/2501985",
  ISSN =         "1936-7406 (print), 1936-7414 (electronic)",
  ISSN-L =       "1936-7406",
  bibdate =      "Thu Mar 13 08:09:45 MDT 2014",
  bibsource =    "http://portal.acm.org/;
                 http://www.math.utah.edu/pub/tex/bib/trets.bib",
  abstract =     "One popular placement algorithm for
                 Field-Programmable Gate Arrays (FPGAs) is called
                 Simulated Annealing (SA). This algorithm tries to
                 create a good quality placement from a flattened design
                 that no longer contains any high-level information
                 related to the original design hierarchy. Placement is
                 an NP-hard problem, and as the size and complexity of
                 designs implemented on FPGAs increases, SA does not
                 scale well to find good solutions in a timely fashion.
                 In this article, we investigate if system-level
                 information can be reconstructed from a flattened
                 netlist and evaluate how that information is realized
                 in terms of its locality in the final placement. If
                 there is a strong relationship between good quality
                 placements and system-level information, then it may be
                 possible to divide a large design into smaller
                 components and improve the time needed to create a good
                 quality placement. Our preliminary results suggest that
                 the locality property of the information embedded in
                 the system-level HDL structure (i.e. ``module'',
                 ``always'', and ``if'' statements) is greatly affected
                 by designer HDL coding style. Therefore, a
                 reconstructive algorithm, called Affinity Propagation,
                 is also considered as a possible method of generating a
                 meaningful coarse-grain picture of the design.",
  acknowledgement = ack-nhfb,
  articleno =    "15",
  fjournal =     "ACM Transactions on Reconfigurable Technology and
                 Systems (TRETS)",
  journal-URL =  "http://portal.acm.org/toc.cfm?id=J1151",
}

@Article{Plavec:2013:ETD,
    author = {Franjo Plavec and Zvonko Vranesic and Stephen Brown},
    title = {Exploiting Task- and Data-Level Parallelism in
        Streaming Applications Implemented in {FPGAs}},
    journal = j-TRETS,
    volume = {6},
    number = {4},
    pages = {16:1--16:??},
    month = dec,
    year = {2013},
    CODEN = {????},
    DOI = {http://dx.doi.org/10.1145/2535932},
    ISSN = {1936-7406 (print), 1936-7414 (electronic)},
    ISSN-L = {1936-7406},
    bibdate = {Thu Mar 13 08:09:46 MDT 2014},
    bibsource = {http://portal.acm.org/;
        http://www.math.utah.edu/pub/tex/bib/trets.bib},
    abstract = {This article describes the design and implementation
        of a novel compilation flow that implements circuits in
        FPGAs from a streaming programming language. The
        streaming language supported is called FPGA Brook and
        is based on the existing Brook language. It allows
        system designers to express applications in a way that
        exposes parallelism, which can be exploited through
        hardware implementation. FPGA Brook supports
        replication, allowing parts of an application to be
        implemented as multiple hardware units operating in
        parallel. Hardware units are interconnected through
        FIFO buffers which use the small memory modules
        available in FPGAs. The FPGA Brook automated design
        flow uses a source-to-source compiler, developed as a
        part of this work, and combines it with a commercial
        behavioral synthesis tool to generate the hardware
        implementation. A suite of benchmark applications was
        developed in FPGA Brook and implemented using our
        design flow. Experimental results indicate that
        performance of many applications scales well with
        replication. Our benchmark applications also achieve
        significantly better results than corresponding
        implementations using a commercial behavioral synthesis
        tool. We conclude that using an automated design flow
        for implementation of streaming applications in FPGAs
        is a promising methodology.},
    acknowledgement = ack-nhfb,
    articleno = {16},
    fjournal = {ACM Transactions on Reconfigurable Technology and
        Systems (TRETS)},
    journal-URL = {http://portal.acm.org/toc.cfm?id=J1151},
}

@Article{Ananthan:2013:RPH,
    author = {T. Ananthan and M. V. Vaidyan},
    title = {A Reconfigurable Parallel Hardware Implementation of
        the Self-Tuning Regulator},
    journal = j-TRETS,
    volume = {6},
    number = {4},
    pages = {17:1--17:??},
    month = dec,
    year = {2013},
    CODEN = {????},
    DOI = {http://dx.doi.org/10.1145/2535934},
    ISSN = {1936-7406 (print), 1936-7414 (electronic)},
    ISSN-L = {1936-7406},
    bibdate = {Thu Mar 13 08:09:46 MDT 2014},
    bibsource = {http://portal.acm.org/;
        http://www.math.utah.edu/pub/tex/bib/trets.bib},
    abstract = {The self-tuning regulator (STR) is a popular adaptive
        control algorithm. A high-performance computer is
        required for its implementation due to the heavy online
        computational burden. To extend STR for more real-time
        applications, a parallel hardware implementation on a
        low-cost reconfigurable computer is presented. The
        hardware was incorporated with multistage matrix
        multiplication (MMM) and trace technique to enhance the
        processing speed. This design was deeply pipelined to
        achieve high throughput. The algorithm was prototyped
        on a Xilinx field-programmable gate array (FPGA) device
        with a maximum operating frequency of 210.436 MHz.
        Application-specific integrated circuit (ASIC)
        implementation of STR was reported.},
    acknowledgement = ack-nhfb,
    articleno = {17},
    fjournal = {ACM Transactions on Reconfigurable Technology and
        Systems (TRETS)},
    journal-URL = {http://portal.acm.org/toc.cfm?id=J1151},
}

@Article{Leow:2013:AME,
    author = {Yoon Kah Leow and Ali Akoglu and Susan Lysecky},
    title = {An Analytical Model for Evaluating Static Power of
        Homogeneous {FPGA} Architectures},
    journal = j-TRETS,
    volume = {6},
    number = {4},
    pages = {18:1--18:??},
    month = dec,
    year = {2013},
    CODEN = {????},
    DOI = {http://dx.doi.org/10.1145/2535935},
    ISSN = {1936-7406 (print), 1936-7414 (electronic)},
    ISSN-L = {1936-7406},
    bibdate = {Thu Mar 13 08:09:46 MDT 2014},
    bibsource = {http://portal.acm.org/;
        http://www.math.utah.edu/pub/tex/bib/trets.bib},
    abstract = {As capacity of the field-programmable gate arrays
        (FPGAs) continues to increase, power dissipated in the
        logic and routing resources has become a critical
        concern for FPGA architects. Recent studies have shown
        that static power is fast approaching the dynamic power
        in submicron devices. In this article, we propose an
        analytical model for relating homogeneous
        island-style-based FPGA architecture to static power.
        Current FPGA power models are tightly coupled with CAD
        tools. Our CAD-independent model captures the static
        power for a given FPGA architecture based on estimates
        of routing and logic resource utilizations from a
        pre-technology mapped netlist. We observe an average
        correlation ratio (C-Ratio) of 95\% and a minimum
        absolute percentage error (MAPE) rate of 15\% with
        respect to the experimental results generated by the
        Versatile Placement Routing (VPR) tool over the MCNC
        benchmarks. Our model offers application engineers and
        FPGA architects the capability to evaluate the impact
        of their design choices on static power without having
        to go through CAD-intensive investigations.},
    acknowledgement = ack-nhfb,
    articleno = {18},
    fjournal = {ACM Transactions on Reconfigurable Technology and
        Systems (TRETS)},
    journal-URL = {http://portal.acm.org/toc.cfm?id=J1151},
}

@Article{Ben-Asher:2013:OWS,
    author = {Yosi Ben-Asher and Ron Meldiner and Nadav Rotem},
    title = {Optimizing Wait States in the Synthesis of Memory
        References with Unpredictable Latencies},
    journal = j-TRETS,
    volume = {6},
    number = {4},
    pages = {19:1--19:??},
    month = dec,
    year = {2013},
    CODEN = {????},
    DOI = {http://dx.doi.org/10.1145/2535936},
    ISSN = {1936-7406 (print), 1936-7414 (electronic)},
    ISSN-L = {1936-7406},
    bibdate = {Thu Mar 13 08:09:46 MDT 2014},
    bibsource = {http://portal.acm.org/;
        http://www.math.utah.edu/pub/tex/bib/trets.bib},
    abstract = {We consider the problem of synthesizing circuits (from
        C to Verilog) that are optimized to handle
        unpredictable latencies of memory operations.
        Unpredictable memory latencies can occur due to the use
        of on chip caches, DRAM memory modules, buffers/queues,
        or multiport memories. Typically, high-level synthesis
        compilers assume fixed and known memory latencies, and
        thus are able to schedule the code's operations
        efficiently. The operations in the source code are
        scheduled into states of a state machine whose states
        will be synthesized to Verilog. The goal is to minimize
        scheduling length by maximizing the number of
        operations (and in particular memory operations) that
        are executed in parallel at the same state. However,
        with unpredictable latencies, there can be an
        exponential number of possible orders in which these
        parallel memory operations can terminate. Thus, in
        order to minimize the scheduling, we need a different
        schedule for any such order. This is not practical, and
        we show a technique of synthesizing a compact state
        machine that schedules only a small subset of these
        possible termination orders. Our results show that this
        compact state machine can improve the execution time
        compared to a regular scheduling that waits for the
        termination of all the active memory references in
        every state.},
    acknowledgement = ack-nhfb,
    articleno = {19},
    fjournal = {ACM Transactions on Reconfigurable Technology and
        Systems (TRETS)},
    journal-URL = {http://portal.acm.org/toc.cfm?id=J1151},
}

@Article{Kornaros:2014:DPT,
    author = {George Kornaros and Dionisios Pnevmatikatos},
    title = {Dynamic Power and Thermal Management of {NoC-Based}
        Heterogeneous {MPSoCs}},
    journal = j-TRETS,
    volume = {7},
    number = {1},
    pages = {1:1--1:??},
    month = feb,
    year = {2014},
    CODEN = {????},
    DOI = {http://dx.doi.org/10.1145/2567658},
    ISSN = {1936-7406 (print), 1936-7414 (electronic)},
    ISSN-L = {1936-7406},
    bibdate = {Thu Mar 13 08:09:47 MDT 2014},
    bibsource = {http://portal.acm.org/;
        http://www.math.utah.edu/pub/tex/bib/trets.bib},
    abstract = {Advances in silicon process technology have made it
        possible to include multiple processor cores on a
        single die. Billion transistor architectures usually in
        the form of networks-on-chip present a wide range of
        challenges in design, microarchitecture, and
        algorithmic levels with significant impact to system
        performance and power consumption. In this article, we
        propose efficient methods and mechanisms that exploit a
        heterogeneous network-on-chip (NoC) to achieve a power-
        and thermal-aware coherent system. To this end, we
        utilize different management techniques which employ
        dynamic frequency scaling circuitry and power and
        temperature sensors per node to achieve real-time
        workload prediction and allocation at node and system
        level by low-cost threads. The developed heterogeneous
        multicoprocessing infrastructure is utilized to
        evaluate diverse policies for power-aware computing in
        terms of effectiveness and in relation to distributed
        sensor-conscious management. The proposed
        reconfigurable architecture supports coprocessor
        accelerators per node, monitors the program's power
        profile on-the-fly, and balances power and thermal
        behavior at the NoC level. Overall, these techniques
        form a system exploration methodology using a
        multi-FPGA emulation platform showing a minimum
        complexity overhead.},
    acknowledgement = ack-nhfb,
    articleno = {1},
    fjournal = {ACM Transactions on Reconfigurable Technology and
        Systems (TRETS)},
    journal-URL = {http://portal.acm.org/toc.cfm?id=J1151},
}

@Article{Iskander:2014:HLA,
  author =       "Yousef Iskander and Cameron Patterson and Stephen
                 Craven",
  title =        "High-Level Abstractions and Modular Debugging for
                 {FPGA} Design Validation",
  journal =      j-TRETS,
  volume =       "7",
  number =       "1",
  pages =        "2:1--2:??",
  month =        feb,
  year =         "2014",
  CODEN =        "????",
  DOI =          "http://dx.doi.org/10.1145/2567662",
  ISSN =         "1936-7406 (print), 1936-7414 (electronic)",
  ISSN-L =       "1936-7406",
  bibdate =      "Thu Mar 13 08:09:47 MDT 2014",
  bibsource =    "http://portal.acm.org/;
                 http://www.math.utah.edu/pub/tex/bib/trets.bib",
  abstract =     "Design validation is the most time-consuming task in
                 the FPGA design cycle. Although manufacturers and
                 third-party vendors offer a range of tools that provide
                 visibility and control of the different stages of a
                 design, many require that the design be fully
                 re-implemented for even simple parameter modifications
                 or do not allow the design to be run at full speed.
                 Designs are typically first modeled using a high-level
                 language then later rewritten in a hardware description
                 language, first for simulation and then later modified
                 for synthesis. IP and third-party cores may differ
                 during these final two stages complicating development
                 and validation. The developed approach provides two
                 means of directly validating synthesized hardware
                 designs. The first allows the original high-level model
                 written in C or C++ to be directly coupled to the
                 synthesized hardware, abstracting away the traditional
                 gate-level view of designs. A high-level programmatic
                 interface allows the synthesized design to be validated
                 directly by the software reference model. The second
                 approach provides an alternative view to FPGAs within
                 the scope of a traditional software debugger. This
                 debug framework leverages partially reconfigurable
                 regions to accelerate the modification of dynamic,
                 software-like breakpoints for low-level analysis and
                 provides an automatable, scriptable, command-line
                 interface directly to a running design on an FPGA.",
  acknowledgement = ack-nhfb,
  articleno =    "2",
  fjournal =     "ACM Transactions on Reconfigurable Technology and
                 Systems (TRETS)",
  journal-URL =  "http://portal.acm.org/toc.cfm?id=J1151",
}

@Article{Jin:2014:FAS,
  author =       "Minxi Jin and Tsutomu Maruyama",
  title =        "Fast and Accurate Stereo Vision System on {FPGA}",
  journal =      j-TRETS,
  volume =       "7",
  number =       "1",
  pages =        "3:1--3:??",
  month =        feb,
  year =         "2014",
  CODEN =        "????",
  DOI =          "http://dx.doi.org/10.1145/2567659",
  ISSN =         "1936-7406 (print), 1936-7414 (electronic)",
  ISSN-L =       "1936-7406",
  bibdate =      "Thu Mar 13 08:09:47 MDT 2014",
  bibsource =    "http://portal.acm.org/;
                 http://www.math.utah.edu/pub/tex/bib/trets.bib",
  abstract =     "In this article, we present a fast and high quality
                 stereo matching algorithm on FPGA using cost
                 aggregation (CA) and fast locally consistent (FLC)
                 dense stereo. In many software programs, global
                 matching algorithms are used in order to obtain
                 accurate disparity maps. Although their error rates are
                 considerably low, their processing speeds are far from
                 that required for real-time processing because of their
                 complex processing sequences. In order to realize
                 real-time processing, many hardware systems have been
                 proposed to date. They have achieved considerably high
                 processing speeds; however, their error rates are not
                 as good as those of software programs, because simple
                 local matching algorithms have been widely used in
                 those systems. In our system, sophisticated local
                 matching algorithms (CA and FLC) that are suitable for
                 FPGA implementation are used to achieve low error rate
                 while maintaining the high processing speed. We
                 evaluate the performance of our circuit on Xilinx
                 Virtex-6 FPGAs. Its error rate is comparable to that of
                 top-level software algorithms, and its processing speed
                 is nearly 2 clock cycles per pixel, which reaches 507.9
                 fps for 640 $ \times $ 480 pixel images.",
  acknowledgement = ack-nhfb,
  articleno =    "3",
  fjournal =     "ACM Transactions on Reconfigurable Technology and
                 Systems (TRETS)",
  journal-URL =  "http://portal.acm.org/toc.cfm?id=J1151",
}

@Article{Ulusel:2014:FDE,
  author =       "Onur Ulusel and Kumud Nepal and R. Iris Bahar and
                 Sherief Reda",
  title =        "Fast Design Exploration for Performance, Power and
                 Accuracy Tradeoffs in {FPGA-Based} Accelerators",
  journal =      j-TRETS,
  volume =       "7",
  number =       "1",
  pages =        "4:1--4:??",
  month =        feb,
  year =         "2014",
  CODEN =        "????",
  DOI =          "http://dx.doi.org/10.1145/2567661",
  ISSN =         "1936-7406 (print), 1936-7414 (electronic)",
  ISSN-L =       "1936-7406",
  bibdate =      "Thu Mar 13 08:09:47 MDT 2014",
  bibsource =    "http://portal.acm.org/;
                 http://www.math.utah.edu/pub/tex/bib/trets.bib",
  abstract =     "The ease-of-use and reconfigurability of FPGAs makes
                 them an attractive platform for accelerating
                 algorithms. However, accelerating becomes a challenging
                 task as the large number of possible design parameters
                 lead to different accelerator variants. In this
                 article, we propose techniques for fast design
                 exploration and multi-objective optimization to quickly
                 identify both algorithmic and hardware parameters that
                 optimize these accelerators. This information is used
                 to run regression analysis and train mathematical
                 models within a nonlinear optimization framework to
                 identify the optimal algorithm and design parameters
                 under various objectives and constraints. To automate
                 and improve the model generation process, we propose
                 the use of L$_1$-regularized least squares regression
                 techniques. We implement two real-time image processing
                 accelerators as test cases: one for image deblurring
                 and one for block matching. For these designs, we
                 demonstrate that by sampling only a small fraction of
                 the design space (0.42\% and 1.1\%), our modeling
                 techniques are accurate within 2\%--4\% for area and
                 throughput, 8\%--9\% for power, and 5\%--6\% for
                 arithmetic accuracy. We show speedups of 340$ \times $
                 and 90$ \times $ in time for the test cases compared to
                 brute-force enumeration. We also identify the optimal
                 set of parameters for a number of scenarios (e.g.,
                 minimizing power under arithmetic inaccuracy bounds).",
  acknowledgement = ack-nhfb,
  articleno =    "4",
  fjournal =     "ACM Transactions on Reconfigurable Technology and
                 Systems (TRETS)",
  journal-URL =  "http://portal.acm.org/toc.cfm?id=J1151",
}

@Article{Kim:2014:FPF,
    author = {Lok-Won Kim and Sameh Asaad and Ralph Linsker},
    title = {A Fully Pipelined {FPGA} Architecture of a Factored
        Restricted {Boltzmann} Machine Artificial Neural
        Network},
    journal = j-TRETS,
    volume = {7},
    number = {1},
    pages = {5:1--5:??},
    month = feb,
    year = {2014},
    CODEN = {????},
    DOI = {http://dx.doi.org/10.1145/2539125},
    ISSN = {1936-7406 (print), 1936-7414 (electronic)},
    ISSN-L = {1936-7406},
    bibdate = {Thu Mar 13 08:09:47 MDT 2014},
    bibsource = {http://portal.acm.org/;
        http://www.math.utah.edu/pub/tex/bib/trets.bib},
    abstract = {Artificial neural networks (ANNs) are a natural target
        for hardware acceleration by FPGAs and GPGPUs because
        commercial-scale applications can require days to weeks
        to train using CPUs, and the algorithms are highly
        parallelizable. Previous work on FPGAs has shown how
        hardware parallelism can be used to accelerate a
        ``Restricted Boltzmann Machine'' (RBM) ANN algorithm,
        and how to distribute computation across multiple
        FPGAs. Here we describe a fully pipelined parallel
        architecture that exploits ``mini-batch'' training
        (combining many input cases to compute each set of
        weight updates) to further accelerate ANN training. We
        implement on an FPGA, for the first time to our
        knowledge, a more powerful variant of the basic RBM,
        the ``Factored RBM'' (fRBM). The fRBM has proved
        valuable in learning transformations and in discovering
        features that are present across multiple types of
        input. We obtain (in simulation) a 100-fold
        acceleration (vs. CPU software) for an fRBM having N =
        256 units in each of its four groups (two input, one
        output, one intermediate group of units) running on a
        Virtex-6 LX760 FPGA. Many of the architectural features
        we implement are applicable not only to fRBMs, but to
        basic RBMs and other ANN algorithms more broadly.},
    acknowledgement = ack-nhfb,
    articleno = {5},
    fjournal = {ACM Transactions on Reconfigurable Technology and
        Systems (TRETS)},
    journal-URL = {http://portal.acm.org/toc.cfm?id=J1151},
}

@Article{Luu:2014:VNG,
  author          = {Jason Luu and Jeffrey Goeders and Michael Wainberg and
                     Andrew Somerville and Thien Yu and Konstantin
                     Nasartschuk and Miad Nasr and Sen Wang and Tim Liu and
                     Nooruddin Ahmed and Kenneth B. Kent and Jason Anderson
                     and Jonathan Rose and Vaughn Betz},
  title           = {{VTR 7.0}: Next Generation Architecture and {CAD}
                     System for {FPGAs}},
  journal         = j-TRETS,
  volume          = {7},
  number          = {2},
  pages           = {6:1--6:??},
  month           = jun,
  year            = {2014},
  CODEN           = {????},
  DOI             = {http://dx.doi.org/10.1145/2617593},
  ISSN            = {1936-7406 (print), 1936-7414 (electronic)},
  ISSN-L          = {1936-7406},
  bibdate         = {Mon Jun 30 18:26:23 MDT 2014},
  bibsource       = {http://portal.acm.org/;
                     http://www.math.utah.edu/pub/tex/bib/trets.bib},
  abstract        = {Exploring architectures for large, modern FPGAs
                     requires sophisticated software that can model and
                     target hypothetical devices. Furthermore, research into
                     new CAD algorithms often requires a complete and open
                     source baseline CAD flow. This article describes recent
                     advances in the open source Verilog-to-Routing (VTR)
                     CAD flow that enable further research in these areas.
                     VTR now supports designs with multiple clocks in both
                     timing analysis and optimization. Hard adder/carry
                     logic can be included in an architecture in various
                     ways and significantly improves the performance of
                     arithmetic circuits. The flow now models energy
                     consumption, an increasingly important concern. The
                     speed and quality of the packing algorithms have been
                     significantly improved. VTR can now generate a netlist
                     of the final post-routed circuit which enables detailed
                     simulation of a design for a variety of purposes. We
                     also release new FPGA architecture files and models
                     that are much closer to modern commercial
                     architectures, enabling more realistic experiments.
                     Finally, we show that while this version of VTR
                     supports new and complex features, it has a 1.5$ \times
                     $ compile time speed-up for simple architectures and a
                     6$ \times $ speed-up for complex architectures compared
                     to the previous release, with no degradation to timing
                     or wire-length quality.},
  acknowledgement = ack-nhfb,
  articleno       = {6},
  fjournal        = {ACM Transactions on Reconfigurable Technology and
                     Systems (TRETS)},
  journal-URL     = {http://portal.acm.org/toc.cfm?id=J1151},
}

@Article{J:2014:MAN,
  author          = {Soumya J. and Ashish Sharma and Santanu
                     Chattopadhyay},
  title           = {Multi-Application Network-on-Chip Design using Global
                     Mapping and Local Reconfiguration},
  journal         = j-TRETS,
  volume          = {7},
  number          = {2},
  pages           = {7:1--7:??},
  month           = jun,
  year            = {2014},
  CODEN           = {????},
  DOI             = {http://dx.doi.org/10.1145/2556944},
  ISSN            = {1936-7406 (print), 1936-7414 (electronic)},
  ISSN-L          = {1936-7406},
  bibdate         = {Mon Jun 30 18:26:23 MDT 2014},
  bibsource       = {http://portal.acm.org/;
                     http://www.math.utah.edu/pub/tex/bib/trets.bib},
  abstract        = {This article proposes a reconfigurable Network-on-Chip
                     (NoC) architecture based on mesh topology. It provides
                     a local reconfiguration of cores to connect to any of
                     the neighboring routers, depending upon the currently
                     executing application. The area overhead for this local
                     reconfiguration has been shown to be very small. We
                     have also presented the strategy to map the cores of an
                     application set onto this architecture. This has been
                     achieved via a two-phase procedure. In the first phase,
                     the cores of the combined application set are mapped
                     tentatively to individual routers, minimizing the
                     communication cost. In the second phase, for each
                     application, positions of individual cores are
                     finalized. A core gets attached to any neighbor of its
                     tentative allocation. We have proposed Integer Linear
                     Programming (ILP) formulation of both the phases. Since
                     ILP takes large amount of CPU time, we have also
                     formulated a Particle Swarm Optimization (PSO)-based
                     solution for the two phases. A heuristic approach has
                     also been developed for the reconfiguration. Comparison
                     of communication cost, latency and network energy have
                     been carried out for the applications, before and after
                     reconfiguration. It shows significant improvement in
                     performance via reconfiguration.},
  acknowledgement = ack-nhfb,
  articleno       = {7},
  fjournal        = {ACM Transactions on Reconfigurable Technology and
                     Systems (TRETS)},
  journal-URL     = {http://portal.acm.org/toc.cfm?id=J1151},
}

%%% NB: the processor name DP\_VELP in the abstract is written with an
%%% escaped underscore so that the text typesets outside math mode.
@Article{Lei:2014:FIS,
  author =       "Yuanwu Lei and Lei Guo and Yong Dou and Sheng Ma and
                 Jinbo Xu",
  title =        "{FPGA} Implementation of a Special-Purpose {VLIW}
                 Structure for Double-Precision Elementary Function",
  journal =      j-TRETS,
  volume =       "7",
  number =       "2",
  pages =        "8:1--8:??",
  month =        jun,
  year =         "2014",
  CODEN =        "????",
  DOI =          "http://dx.doi.org/10.1145/2617594",
  ISSN =         "1936-7406 (print), 1936-7414 (electronic)",
  ISSN-L =       "1936-7406",
  bibdate =      "Mon Jun 30 18:26:23 MDT 2014",
  bibsource =    "http://portal.acm.org/;
                 http://www.math.utah.edu/pub/tex/bib/fparith.bib;
                 http://www.math.utah.edu/pub/tex/bib/trets.bib",
  abstract =     "In the current article, the capability and flexibility
                 of field programmable gate-arrays (FPGAs) to implement
                 IEEE-754 double-precision floating-point elementary
                 functions are explored. To perform various elementary
                 functions on the unified hardware efficiently, we
                 propose a special-purpose very long instruction word
                 (VLIW) processor, called DP\_VELP. This processor is
                 equipped with multiple basic units, and its performance
                 is improved through an explicitly parallel technique.
                 Pipelined evaluation of polynomial approximation with
                 Estrin's scheme is proposed, by scheduling basic
                 components in an optimal order to avoid data hazard
                 stalls and achieve minimal latency. The custom VLIW
                 processor can achieve high scalability. Under the
                 control of specific VLIW instructions, the basic units
                 are combined into special-purpose hardware for
                 elementary functions. Common elementary functions are
                 presented as examples to illustrate the design of
                 elementary function in DP\_VELP in detail. Minimax
                 approximation scheme is used to reduce degree of
                 polynomial. Compromise between the size of lookup table
                 and the latency is discussed, and the internal
                 precision is carefully planned to guarantee accuracy of
                 the result. Finally, we create a prototype of the
                 DP\_VELP unit and an FPGA accelerator based on the
                 DP\_VELP unit on a Xilinx XC6VLX760 FPGA chip to
                 implement the SGP4/SDP4 application. Compared with
                 previous researches, the proposed design can achieve
                 low latency with a reasonable amount of resources and
                 evaluate a variety of elementary functions with the
                 unified hardware to satisfy the demands in scientific
                 applications. Experimental results show that the
                 proposed design guarantees more than 99\% of correct
                 rounding. Moreover, the SGP4/SDP4 accelerator, which is
                 equipped with 39 DP\_VELP units and runs at 200 MHz,
                 outperforms the parallel software approach with
                 hyper-thread technology on an Intel Xeon Quad E5620 CPU
                 at 2.40 GHz by a factor of 7X.",
  acknowledgement = ack-nhfb,
  articleno =    "8",
  fjournal =     "ACM Transactions on Reconfigurable Technology and
                 Systems (TRETS)",
  journal-URL =  "http://portal.acm.org/toc.cfm?id=J1151",
}

@Article{Clemente:2014:MSA,
  author          = {Juan Antonio Clemente and Ivan Beretta and Vincenzo
                     Rana and David Atienza and Donatella Sciuto},
  title           = {A Mapping-Scheduling Algorithm for Hardware
                     Acceleration on Reconfigurable Platforms},
  journal         = j-TRETS,
  volume          = {7},
  number          = {2},
  pages           = {9:1--9:??},
  month           = jun,
  year            = {2014},
  CODEN           = {????},
  DOI             = {http://dx.doi.org/10.1145/2611562},
  ISSN            = {1936-7406 (print), 1936-7414 (electronic)},
  ISSN-L          = {1936-7406},
  bibdate         = {Mon Jun 30 18:26:23 MDT 2014},
  bibsource       = {http://portal.acm.org/;
                     http://www.math.utah.edu/pub/tex/bib/trets.bib},
  abstract        = {Reconfigurable platforms are a promising technology
                     that offers an interesting trade-off between
                     flexibility and performance, which many recent embedded
                     system applications demand, especially in fields such
                     as multimedia processing. These applications typically
                     involve multiple ad-hoc tasks for hardware
                     acceleration, which are usually represented using
                     formalisms such as Data Flow Diagrams (DFDs), Data Flow
                     Graphs (DFGs), Control and Data Flow Graphs (CDFGs) or
                     Petri Nets. However, none of these models is able to
                     capture at the same time the pipeline behavior between
                     tasks (that therefore can coexist in order to minimize
                     the application execution time), their communication
                     patterns, and their data dependencies. This article
                     proves that the knowledge of all this information can
                     be effectively exploited to reduce the resource
                     requirements and the timing performance of modern
                     reconfigurable systems, where a set of hardware
                     accelerators is used to support the computation. For
                     this purpose, this article proposes a novel task
                     representation model, named Temporal Constrained Data
                     Flow Diagram (TCDFD), which includes all this
                     information. This article also presents a
                     mapping-scheduling algorithm that is able to take
                     advantage of the new TCDFD model. It aims at minimizing
                     the dynamic reconfiguration overhead while meeting the
                     communication requirements among the tasks.
                     Experimental results show that the presented approach
                     achieves up to 75\% of resources saving and up to 89\%
                     of reconfiguration overhead reduction with respect to
                     other state-of-the-art techniques for reconfigurable
                     platforms.},
  acknowledgement = ack-nhfb,
  articleno       = {9},
  fjournal        = {ACM Transactions on Reconfigurable Technology and
                     Systems (TRETS)},
  journal-URL     = {http://portal.acm.org/toc.cfm?id=J1151},
}

@Article{Hoang:2014:IMD,
  author          = {Anh-Tuan Hoang and Takeshi Fujino},
  title           = {Intra-Masking Dual-Rail Memory on {LUT} Implementation
                     for {SCA}-Resistant {AES} on {FPGA}},
  journal         = j-TRETS,
  volume          = {7},
  number          = {2},
  pages           = {10:1--10:??},
  month           = jun,
  year            = {2014},
  CODEN           = {????},
  DOI             = {http://dx.doi.org/10.1145/2617595},
  ISSN            = {1936-7406 (print), 1936-7414 (electronic)},
  ISSN-L          = {1936-7406},
  bibdate         = {Mon Jun 30 18:26:23 MDT 2014},
  bibsource       = {http://portal.acm.org/;
                     http://www.math.utah.edu/pub/tex/bib/cryptography2010.bib;
                     http://www.math.utah.edu/pub/tex/bib/trets.bib},
  abstract        = {In current countermeasure design trends against
                     differential power analysis (DPA), security at gate
                     level is required in addition to the security
                     algorithm. Several dual-rail pre-charge logics (DPL)
                     have been proposed to achieve this goal. Designs using
                     ASIC can attain this goal owing to its backend design
                     restrictions on placement and routing. However,
                     implementing these designs on field programmable gate
                     arrays (FPGA) without information leakage is still a
                     problem because of the difficulty involved in the
                     restrictions on placement and routing on FPGA. This
                     article describes our novel masked dual-rail
                     pre-charged memory approach, called `intra-masking
                     dual-rail memory (IMDRM) on LUT', and its
                     implementation on FPGA for Side-Channel
                     Attack-resistant (SCA-resistant) AES. In the proposed
                     design, all unsafe nodes, such as unmasking and
                     masking, and parts of dual-rail memory with unsafe
                     buses (buses that are not masked) are packed into a
                     single LUT. This makes them balanced and independent of
                     the placement and routing tools. Inputs and outputs of
                     all LUTs are masked, and so can be considered safe
                     signals. Several LUTs can be combined to create a safe
                     SBox. The design is independent of the cryptographic
                     algorithm, and hence, it can be applied to available
                     cryptographic standards such as DES or AES as well as
                     future standards. It requires no special placement or
                     route constraints in its implementation. A correlation
                     power analysis (CPA) attack on 1,000,000 traces of AES
                     implementation on FPGA showed that the secret
                     information is well protected against first-order
                     side-channel attacks. Even though the number of LUTs
                     used for memory in this implementation is seven times
                     greater than that of the conventional unprotected
                     single-rail memory table-lookup AES and three times
                     greater than the implementation based on a composite
                     field, it requires a smaller number of LUTs than all
                     other advanced SCA-resistant implementations such as
                     the wave dynamic differential logic, masked dual-rail
                     pre-charge logic, and threshold.},
  acknowledgement = ack-nhfb,
  articleno       = {10},
  fjournal        = {ACM Transactions on Reconfigurable Technology and
                     Systems (TRETS)},
  journal-URL     = {http://portal.acm.org/toc.cfm?id=J1151},
}

@Article{Becker:2014:ITS,
  author          = {Tobias Becker},
  title           = {Introduction to the {TRETS} Special Section on the
                     {Workshop on Self-Awareness in Reconfigurable Computing
                     Systems (SRCS'12)}},
  journal         = j-TRETS,
  volume          = {7},
  number          = {2},
  pages           = {11:1--11:??},
  month           = jun,
  year            = {2014},
  CODEN           = {????},
  DOI             = {http://dx.doi.org/10.1145/2611564},
  ISSN            = {1936-7406 (print), 1936-7414 (electronic)},
  ISSN-L          = {1936-7406},
  bibdate         = {Mon Jun 30 18:26:23 MDT 2014},
  bibsource       = {http://portal.acm.org/;
                     http://www.math.utah.edu/pub/tex/bib/trets.bib},
  acknowledgement = ack-nhfb,
  articleno       = {11},
  fjournal        = {ACM Transactions on Reconfigurable Technology and
                     Systems (TRETS)},
  journal-URL     = {http://portal.acm.org/toc.cfm?id=J1151},
}

%%% NB: the dash in "energy---for example" in the abstract is an em-dash,
%%% written as a TeX triple hyphen (it had been reduced to a single hyphen).
@Article{Panerati:2014:CIL,
  author =       "Jacopo Panerati and Martina Maggio and Matteo
                 Carminati and Filippo Sironi and Marco Triverio and
                 Marco D. Santambrogio",
  title =        "Coordination of Independent Loops in Self-Adaptive
                 Systems",
  journal =      j-TRETS,
  volume =       "7",
  number =       "2",
  pages =        "12:1--12:??",
  month =        jun,
  year =         "2014",
  CODEN =        "????",
  DOI =          "http://dx.doi.org/10.1145/2611563",
  ISSN =         "1936-7406 (print), 1936-7414 (electronic)",
  ISSN-L =       "1936-7406",
  bibdate =      "Mon Jun 30 18:26:23 MDT 2014",
  bibsource =    "http://portal.acm.org/;
                 http://www.math.utah.edu/pub/tex/bib/trets.bib",
  abstract =     "Nowadays, the same piece of code should run on
                 different architectures, providing performance
                 guarantees in a variety of environments and situations.
                 To this end, designers often integrate existing systems
                 with ad-hoc adaptive strategies able to tune specific
                 parameters that impact performance or energy---for
                 example, frequency scaling. However, these strategies
                 interfere with one another and unpredictable
                 performance degradation may occur due to the
                 interaction between different entities. In this
                 article, we propose a software approach to
                 reconfiguration when different strategies, called
                 loops, are encapsulated in the system and are available
                 to be activated. Our solution to loop coordination is
                 based on machine learning and it selects a policy for
                 the activation of loops inside of a system without
                 prior knowledge. We implemented our solution on top of
                 GNU/Linux and evaluated it with a significant subset of
                 the PARSEC benchmark suite.",
  acknowledgement = ack-nhfb,
  articleno =    "12",
  fjournal =     "ACM Transactions on Reconfigurable Technology and
                 Systems (TRETS)",
  journal-URL =  "http://portal.acm.org/toc.cfm?id=J1151",
}

@Article{Agne:2014:SAM,
  author          = {Andreas Agne and Markus Happe and Achim L{\"o}sch and
                     Christian Plessl and Marco Platzner},
  title           = {Self-Awareness as a Model for Designing and Operating
                     Heterogeneous Multicores},
  journal         = j-TRETS,
  volume          = {7},
  number          = {2},
  pages           = {13:1--13:??},
  month           = jun,
  year            = {2014},
  CODEN           = {????},
  DOI             = {http://dx.doi.org/10.1145/2617596},
  ISSN            = {1936-7406 (print), 1936-7414 (electronic)},
  ISSN-L          = {1936-7406},
  bibdate         = {Mon Jun 30 18:26:23 MDT 2014},
  bibsource       = {http://portal.acm.org/;
                     http://www.math.utah.edu/pub/tex/bib/trets.bib},
  abstract        = {Self-aware computing is a paradigm for structuring and
                     simplifying the design and operation of computing
                     systems that face unprecedented levels of system
                     dynamics and thus require novel forms of adaptivity.
                     The generality of the paradigm makes it applicable to
                     many types of computing systems and, previously,
                     researchers started to introduce concepts of
                     self-awareness to multicore architectures. In our work
                     we build on a recent reference architectural framework
                     as a model for self-aware computing and instantiate it
                     for an FPGA-based heterogeneous multicore running the
                     ReconOS reconfigurable architecture and operating
                     system. After presenting the model for self-aware
                     computing and ReconOS, we demonstrate with a case study
                     how a multicore application built on the principle of
                     self-awareness, autonomously adapts to changes in the
                     workload and system state. Our work shows that the
                     reference architectural framework as a model for
                     self-aware computing can be practically applied and
                     allows us to structure and simplify the design process,
                     which is essential for designing complex future
                     computing systems.},
  acknowledgement = ack-nhfb,
  articleno       = {13},
  fjournal        = {ACM Transactions on Reconfigurable Technology and
                     Systems (TRETS)},
  journal-URL     = {http://portal.acm.org/toc.cfm?id=J1151},
}

@Article{Beckhoff:2014:DTI,
  author          = {Christian Beckhoff and Dirk Koch and Jim Torresen},
  title           = {Design Tools for Implementing Self-Aware and
                     Fault-Tolerant Systems on {FPGAs}},
  journal         = j-TRETS,
  volume          = {7},
  number          = {2},
  pages           = {14:1--14:??},
  month           = jun,
  year            = {2014},
  CODEN           = {????},
  DOI             = {http://dx.doi.org/10.1145/2617597},
  ISSN            = {1936-7406 (print), 1936-7414 (electronic)},
  ISSN-L          = {1936-7406},
  bibdate         = {Mon Jun 30 18:26:23 MDT 2014},
  bibsource       = {http://portal.acm.org/;
                     http://www.math.utah.edu/pub/tex/bib/trets.bib},
  abstract        = {To fully exploit the capabilities of runtime
                     reconfigurable FPGAs in self-aware systems, design
                     tools are required that exceed the capabilities of
                     present vendor design tools. Such tools must allow the
                     implementation of scalable reconfigurable systems with
                     various different partial modules that might be loaded
                     to different positions of the device at runtime. This
                     comprises several complex tasks, including
                     floorplanning, communication architecture synthesis,
                     physical constraints generation, physical
                     implementation, and timing verification all the way
                     down to the final bitstream generation. In this
                     article, we present how our GoAhead framework helps in
                     implementing self-aware systems on FPGAs with a minimum
                     of user interaction.},
  acknowledgement = ack-nhfb,
  articleno       = {14},
  fjournal        = {ACM Transactions on Reconfigurable Technology and
                     Systems (TRETS)},
  journal-URL     = {http://portal.acm.org/toc.cfm?id=J1151},
}

@Article{Niu:2014:SAT,
  author          = {Xinyu Niu and Qiwei Jin and Wayne Luk and Stephen
                     Weston},
  title           = {A Self-Aware Tuning and Self-Aware Evaluation Method
                     for Finite-Difference Applications in Reconfigurable
                     Systems},
  journal         = j-TRETS,
  volume          = {7},
  number          = {2},
  pages           = {15:1--15:??},
  month           = jun,
  year            = {2014},
  CODEN           = {????},
  DOI             = {http://dx.doi.org/10.1145/2617598},
  ISSN            = {1936-7406 (print), 1936-7414 (electronic)},
  ISSN-L          = {1936-7406},
  bibdate         = {Mon Jun 30 18:26:23 MDT 2014},
  bibsource       = {http://portal.acm.org/;
                     http://www.math.utah.edu/pub/tex/bib/trets.bib},
  abstract        = {Finite-difference methods are computationally
                     intensive and required by many applications. Parameters
                     of a finite-difference algorithm, such as grid size,
                     can be varied to generate design space which contains
                     algorithm instances with different constant
                     coefficients. An algorithm instance with specific
                     coefficients can either be mapped into general
                     operators to construct static designs, or be
                     implemented as constant-specific operators to form
                     dynamic designs, which require runtime reconfiguration
                     to update algorithm coefficients. This article proposes
                     a tuning method to explore the design space to optimise
                     both the static and the dynamic designs, and an
                     evaluation method to select the design with maximum
                     overall throughput, based on algorithm characteristics,
                     design properties, available resources and runtime data
                     size. For benchmark applications option pricing and
                     Reverse-Time Migration (RTM), over 50\% reduction in
                     resource consumption has been achieved for both static
                     designs and dynamic designs, while meeting precision
                     requirements. For a single hardware implementation, the
                     RTM design optimised with the proposed approach is
                     expected to run 1.8 times faster than the best
                     published design. The tuned static designs run
                     thousands of times faster than the dynamic designs for
                     algorithms with small data size, while the tuned
                     dynamic designs achieve up to 5.9 times speedup over
                     the corresponding static designs for large-scale
                     finite-difference algorithms.},
  acknowledgement = ack-nhfb,
  articleno       = {15},
  fjournal        = {ACM Transactions on Reconfigurable Technology and
                     Systems (TRETS)},
  journal-URL     = {http://portal.acm.org/toc.cfm?id=J1151},
}